diff --git a/.github/workflows/PSDB-amd-mainline.yml b/.github/workflows/PSDB-amd-mainline.yml index 31890d7200f16..35a3095419194 100644 --- a/.github/workflows/PSDB-amd-mainline.yml +++ b/.github/workflows/PSDB-amd-mainline.yml @@ -1,5 +1,4 @@ -# This workflow is used to invoke the PSDB jenkins job for ROCm Compiler CI. The python script can be used to invoke any jenkins job but input params needs to be configured properly -name: Compiler CI test +name: Compiler CI PSDB trigger on amd-mainline branch # Controls when the workflow will run on: @@ -10,46 +9,64 @@ on: # A workflow run is made up of one or more jobs that can run sequentially or in parallel, below is a single job called invoke jenkins jobs jobs: - # This workflow contains a single job called "build" + # This workflow contains a single job called "invoke_jenkins_PSDB" invoke_jenkins_PSDB: - # The type of runner that the job will run on. For github hosted runner use (${{ 'ubuntu-latest' }}) or self-hosted for sel-hosted runner. 
- #runs-on: ubuntu-latest if: github.event.pull_request.draft == false - runs-on: self-hosted - container: - image: compute-artifactory.amd.com:5000/rocm-base-images/ghemu-action-ubuntu-24.04:2024101101 + runs-on: + group: compiler-generic-runners env: - svc_acc_org_secret: ${{'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'}} + svc_acc_org_secret: ${{secrets.CI_GITHUB_TOKEN}} input_sha: ${{ github.event.pull_request.head.sha != '' && github.event.pull_request.head.sha || github.sha }} input_pr_num: ${{ github.event.pull_request.number != '' && github.event.pull_request.number || 0 }} input_pr_url: ${{ github.event.pull_request.html_url != '' && github.event.pull_request.html_url || '' }} input_pr_title: ${{ github.event.pull_request.title != '' && github.event.pull_request.title || '' }} # set the pipeline name here based on branch name - pipeline_name: ${{ 'compiler-psdb-amd-mainline' }} - JENKINS_URL: ${{'https://compiler-ci.amd.com/'}} + pipeline_name: ${{secrets.CI_JENKINS_MAINLINE_JOB_NAME}} + JENKINS_URL: ${{secrets.CI_JENKINS_URL}} + CONTAINER_IMAGE: ${{ secrets.JENKINS_TRIGGER_DOCKER_IMAGE }} + # Steps represent a sequence of tasks that will be executed as part of the job - steps: - # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - name: Check out rocm_ci_infra private repo - uses: actions/checkout@main - with: - #ref: ci-utils-dev-siva - #fetch-depth: 2 - repository: AMD-Lightning-Internal/ci-utils - token: ${{ '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' }} - #token: ${{'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'}} + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - name: Set environment variable for container image + run: | + echo "CONTAINER_IMAGE=${{ secrets.JENKINS_TRIGGER_DOCKER_IMAGE }}" >> $GITHUB_ENV + echo "CONTAINER_NAME=my_container_${{ github.run_id }}" >> $GITHUB_ENV + + + - name: Pull container image + run: docker pull "${{env.CONTAINER_IMAGE}}" + + - name: Run container + 
run: | + docker run -d --name "${{env.CONTAINER_NAME}}" $CONTAINER_IMAGE sleep infinity + #docker exec "${{env.CONTAINER_NAME}}" /bin/bash -c "git clone ${{secrets.CI_UTILS_REPO}} ." + docker exec "${{env.CONTAINER_NAME}}" /bin/bash -c "echo 'Running commands inside the container'" + + - name: Escape pull request title + run: | + import json + import os + import shlex + with open('${{ github.event_path }}') as fh: + event = json.load(fh) + escaped = event['pull_request']['title'] + with open(os.environ['GITHUB_ENV'], 'a') as fh: + print(f'PR_TITLE={escaped}', file=fh) + shell: python3 {0} + - name: Run Jenkins Cancel Script env: - JENKINS_URL: ${{ 'https://compiler-ci.amd.com/' }} - JENKINS_USER: ${{ 'z1_cciauto' }} - JENKINS_API_TOKEN: ${{ '11bdb3dcd61f1a00f3999c8e3a0d6da9a7' }} - JENKINS_JOB_NAME: ${{ 'compiler-psdb-amd-mainline' }} + JENKINS_URL: ${{secrets.CI_JENKINS_URL}} + JENKINS_USER: ${{secrets.CI_JENKINS_USER}} + JENKINS_API_TOKEN: ${{secrets.CI_JENKINS_TOKEN}} + JENKINS_JOB_NAME: ${{secrets.CI_JENKINS_JOB_NAME}} PR_NUMBER: ${{ github.event.pull_request.number }} COMMIT_HASH: ${{ github.event.after }} run: | - python3 cancel_previous_build.py + docker exec -e JENKINS_JOB_NAME=${{secrets.CI_JENKINS_JOB_NAME}} -e PR_NUMBER=${{ github.event.pull_request.number }} -e COMMIT_HASH=${{ github.event.after }} -e JENKINS_URL=${{secrets.CI_JENKINS_URL}} -e JENKINS_USER=${{secrets.CI_JENKINS_USER}} -e JENKINS_API_TOKEN=${{secrets.CI_JENKINS_TOKEN}} "${{env.CONTAINER_NAME}}" /bin/bash -c "PYTHONHTTPSVERIFY=0 python3 cancel_previous_build.py" # Runs a set of commands using the runners shell @@ -65,21 +82,25 @@ jobs: echo "GITHUB_REF_NAME is: $GITHUB_REF_NAME" echo "github.event.pull_request.id is: ${{github.event.pull_request.id}}" echo "github.event.pull_request.html_url is: ${{github.event.pull_request.html_url}}" - echo "github.event.pull_request.number is: ${{github.event.pull_request.number}}" + echo "github.event.pull_request.number is: 
${{github.event.pull_request.number}}" echo "github.event.pull_request.url is: ${{github.event.pull_request.url}}" echo "github.event.pull_request.issue_url is: ${{github.event.pull_request.issue_url}}" - echo "github.event.pull_request.comments_url is: ${{github.event.pull_request.comments_url}}" - echo "github.event.pull_request.statuses_url is: ${{github.event.pull_request.statuses_url}}" echo "github.event.pull_request.head.sha is: ${{github.event.pull_request.head.sha}}" echo "github.event.pull_request.base.ref is: ${{github.event.pull_request.base.ref}}" echo "github.event.pull_request.merge_commit_sha is: ${{github.event.pull_request.merge_commit_sha}}" echo "github.event.pull_request is: ${{github.event.pull_request}}" - pip3 show python-jenkins || echo "python-jenkins is not installed" - #sudo -H pip3 install --upgrade python-jenkins - # pipeline name shuould be unique to the workfow yml for a given repository - #curl -L -X POST -H "Accept: application/vnd.github+json" -H "Authorization: Bearer ${{'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'}}" ${{github.event.pull_request.comments_url}} -d '{"body":"Github action triggered jenkins job for compute-psdb-staging-smi-libs-ghemu "}' + + - name: Trigger Jenkins Pipeline if: steps.check_changes.outcome != 'failure' run: | - echo "running jenkins_api.py with input sha - $input_sha for pull request - $input_pr_url" - python3 jenkins_api.py -s $JENKINS_URL -jn $pipeline_name -ghr $GITHUB_REPOSITORY -ghsha $input_sha -ghprn $input_pr_num -ghpru "$input_pr_url" -ghprt "$input_pr_title" -ghpat $svc_acc_org_secret + echo "--Running jenkins_api.py with input sha - $input_sha for pull request - $input_pr_url" + docker exec -e GITHUB_REPOSITORY="$GITHUB_REPOSITORY" -e svc_acc_org_secret="$svc_acc_org_secret" -e input_sha="$input_sha" -e input_pr_url="$input_pr_url" -e pipeline_name="$pipeline_name" \ + -e input_pr_num="$input_pr_num" -e PR_TITLE="$PR_TITLE" -e JENKINS_URL="$JENKINS_URL" -e GITHUB_PAT="$svc_acc_org_secret" 
"${{env.CONTAINER_NAME}}" \ + /bin/bash -c 'echo \"PR NUM: "$input_pr_num"\" && PYTHONHTTPSVERIFY=0 python3 jenkins_api.py -s \"${JENKINS_URL}\" -jn "$pipeline_name" -ghr "$GITHUB_REPOSITORY" -ghsha "$input_sha" -ghprn "$input_pr_num" -ghpru "$input_pr_url" -ghprt "$PR_TITLE" -ghpat="$svc_acc_org_secret"' + + - name: Stop and remove container + if: always() + run: | + docker stop "${{env.CONTAINER_NAME}}" + docker rm "${{env.CONTAINER_NAME}}" diff --git a/.github/workflows/PSDB-amd-staging.yml b/.github/workflows/PSDB-amd-staging.yml deleted file mode 100644 index 38edc7adb8339..0000000000000 --- a/.github/workflows/PSDB-amd-staging.yml +++ /dev/null @@ -1,85 +0,0 @@ -# This workflow is used to invoke the PSDB jenkins job for ROCm Compiler CI. The python script can be used to invoke any jenkins job but input params needs to be configured properly -name: Compiler CI amd-staging branch trigger - -# Controls when the workflow will run -on: - pull_request: - branches: [amd-staging] - types: [opened, reopened, synchronize, ready_for_review] - workflow_dispatch: - -# A workflow run is made up of one or more jobs that can run sequentially or in parallel, below is a single job called invoke jenkins jobs -jobs: - # This workflow contains a single job called "build" - invoke_jenkins_PSDB: - # The type of runner that the job will run on. For github hosted runner use (${{ 'ubuntu-latest' }}) or self-hosted for sel-hosted runner. 
- #runs-on: ubuntu-latest - if: github.event.pull_request.draft == false - runs-on: self-hosted - container: - image: compute-artifactory.amd.com:5000/rocm-base-images/ghemu-action-ubuntu-24.04:2024101101 - env: - svc_acc_org_secret: ${{'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'}} - input_sha: ${{ github.event.pull_request.head.sha != '' && github.event.pull_request.head.sha || github.sha }} - input_pr_num: ${{ github.event.pull_request.number != '' && github.event.pull_request.number || 0 }} - input_pr_url: ${{ github.event.pull_request.html_url != '' && github.event.pull_request.html_url || '' }} - input_pr_title: ${{ github.event.pull_request.title != '' && github.event.pull_request.title || '' }} - # set the pipeline name here based on branch name - pipeline_name: ${{ 'compiler-psdb-amd-staging' }} - JENKINS_URL: ${{'https://compiler-ci.amd.com/'}} - # Steps represent a sequence of tasks that will be executed as part of the job - steps: - # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - name: Check out rocm_ci_infra private repo - uses: actions/checkout@main - with: - #ref: ci-utils-dev-siva - #fetch-depth: 2 - repository: AMD-Lightning-Internal/ci-utils - token: ${{ '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' }} - #token: ${{'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'}} - - - - name: Run Jenkins Cancel Script - env: - JENKINS_URL: ${{ 'https://compiler-ci.amd.com/' }} - JENKINS_USER: ${{ 'z1_cciauto' }} - JENKINS_API_TOKEN: ${{ '11bdb3dcd61f1a00f3999c8e3a0d6da9a7' }} - JENKINS_JOB_NAME: ${{ 'compiler-psdb-amd-staging' }} - PR_NUMBER: ${{ github.event.pull_request.number }} - COMMIT_HASH: ${{ github.event.after }} - run: | - python3 cancel_previous_build.py - - - # Runs a set of commands using the runners shell - - name: Getting Event Details - run: | - echo $(pwd) - echo $GITHUB_ENV - echo $GITHUB_REPOSITORY - echo $GITHUB_SERVER_URL - echo "GITHUB_SHA is: $GITHUB_SHA" - echo "GITHUB_WORKFLOW_SHA is: $GITHUB_WORKFLOW_SHA" - 
echo "GITHUB_BASE_REF is: $GITHUB_BASE_REF" - echo "GITHUB_REF_NAME is: $GITHUB_REF_NAME" - echo "github.event.pull_request.id is: ${{github.event.pull_request.id}}" - echo "github.event.pull_request.html_url is: ${{github.event.pull_request.html_url}}" - echo "github.event.pull_request.number is: ${{github.event.pull_request.number}}" - echo "github.event.pull_request.url is: ${{github.event.pull_request.url}}" - echo "github.event.pull_request.issue_url is: ${{github.event.pull_request.issue_url}}" - echo "github.event.pull_request.comments_url is: ${{github.event.pull_request.comments_url}}" - echo "github.event.pull_request.statuses_url is: ${{github.event.pull_request.statuses_url}}" - echo "github.event.pull_request.head.sha is: ${{github.event.pull_request.head.sha}}" - echo "github.event.pull_request.base.ref is: ${{github.event.pull_request.base.ref}}" - echo "github.event.pull_request.merge_commit_sha is: ${{github.event.pull_request.merge_commit_sha}}" - echo "github.event.pull_request is: ${{github.event.pull_request}}" - pip3 show python-jenkins || echo "python-jenkins is not installed" - #sudo -H pip3 install --upgrade python-jenkins - # pipeline name shuould be unique to the workfow yml for a given repository - #curl -L -X POST -H "Accept: application/vnd.github+json" -H "Authorization: Bearer ${{'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'}}" ${{github.event.pull_request.comments_url}} -d '{"body":"Github action triggered jenkins job for compute-psdb-staging-smi-libs-ghemu "}' - - name: Trigger Jenkins Pipeline - if: steps.check_changes.outcome != 'failure' - run: | - echo "running jenkins_api.py with input sha - $input_sha for pull request - $input_pr_url" - python3 jenkins_api.py -s $JENKINS_URL -jn $pipeline_name -ghr $GITHUB_REPOSITORY -ghsha $input_sha -ghprn $input_pr_num -ghpru "$input_pr_url" -ghprt "$input_pr_title" -ghpat $svc_acc_org_secret diff --git a/amd/comgr/CMakeLists.txt b/amd/comgr/CMakeLists.txt index 0c7a836e22941..2eeb8c2ac6ac2 
100644 --- a/amd/comgr/CMakeLists.txt +++ b/amd/comgr/CMakeLists.txt @@ -38,7 +38,6 @@ endif(EXISTS "${CMAKE_SOURCE_DIR}/../../.git") include(GNUInstallDirs) include(CMakePackageConfigHelpers) -option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" OFF) # Optionally, build Compiler Support with ccache. set(ROCM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build") if (ROCM_CCACHE_BUILD) @@ -71,6 +70,10 @@ option(COMGR_BUILD_SHARED_LIBS "Build the shared library" ${build_shared_libs_default}) set(SOURCES + src/comgr-cache.cpp + src/comgr-cache-command.cpp + src/comgr-cache-bundler-command.cpp + src/comgr-clang-command.cpp src/comgr-compiler.cpp src/comgr.cpp src/comgr-device-libs.cpp @@ -81,6 +84,7 @@ set(SOURCES src/comgr-metadata.cpp src/comgr-objdump.cpp src/comgr-signal.cpp + src/comgr-spirv-command.cpp src/comgr-symbol.cpp src/comgr-symbolizer.cpp src/time-stat/time-stat.cpp) @@ -178,9 +182,12 @@ message("") option(COMGR_DISABLE_SPIRV "To disable SPIRV in Comgr" OFF) if (NOT COMGR_DISABLE_SPIRV) - CHECK_INCLUDE_FILE_CXX(LLVMSPIRVLib/LLVMSPIRVLib.h HAVE_LLVMSPIRVLIB_H) - if (NOT HAVE_LLVMSPIRVLIB_H) + # TODO: Explore switching this to CHECK_INCLUDE_FILE_CXX() macro + if (NOT EXISTS "${LLVM_INCLUDE_DIRS}/LLVMSPIRVLib/LLVMSPIRVLib.h") + message("-- LLVMSPIRVLib/LLVMSPIRVLib.h not found") set(COMGR_DISABLE_SPIRV ON) + else() + message("-- LLVMSPIRVLib/LLVMSPIRVLib.h found") endif() endif() @@ -319,26 +326,6 @@ install(FILES COMPONENT amd-comgr DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${amd_comgr_NAME}) -#File reorg Backward compatibility function -if(FILE_REORG_BACKWARD_COMPATIBILITY) -# To enable/disable #error in wrapper header files - if(NOT DEFINED ROCM_HEADER_WRAPPER_WERROR) - if(DEFINED ENV{ROCM_HEADER_WRAPPER_WERROR}) - set(ROCM_HEADER_WRAPPER_WERROR "$ENV{ROCM_HEADER_WRAPPER_WERROR}" - CACHE STRING "Header wrapper warnings as errors.") - else() - set(ROCM_HEADER_WRAPPER_WERROR "OFF" CACHE STRING "Header wrapper 
warnings as errors.") - endif() - endif() - if(ROCM_HEADER_WRAPPER_WERROR) - set(deprecated_error 1) - else() - set(deprecated_error 0) - endif() - - include(comgr-backward-compat.cmake) -endif() - if(ENABLE_ASAN_PACKAGING) install(FILES "LICENSE.txt" diff --git a/amd/comgr/README.md b/amd/comgr/README.md index 686f5c96c890c..7f8985431b9af 100644 --- a/amd/comgr/README.md +++ b/amd/comgr/README.md @@ -125,7 +125,27 @@ These include: certain runtime headers. If this is not set, it has a default value of "${ROCM_PATH}/llvm". -Comgr also supports some environment variables to aid in debugging. These +Comgr utilizes a cache to preserve the results of compilations between executions. +The cache's status (enabled/disabled), storage location for its results, +and eviction policy can be manipulated through specific environment variables. +If an issue arises during cache initialization, the execution will proceed with +the cache turned off. + +By default, the cache is enabled. + +* `AMD_COMGR_CACHE`: When unset or set to a value different than "0", the cache is enabled. + Disabled when set to "0". +* `AMD_COMGR_CACHE_DIR`: If assigned a non-empty value, that value is used as + the path for cache storage. If the variable is unset or set to an empty string `""`, + it is directed to "$XDG_CACHE_HOME/comgr" (which defaults to + "$HOME/.cache/comgr" on Linux, and "%LOCALAPPDATA%\cache\comgr" + on Microsoft Windows). +* `AMD_COMGR_CACHE_POLICY`: If assigned a value, the string is interpreted and + applied to the cache pruning policy. The cache is pruned only upon program + termination. The string format aligns with [Clang's ThinLTO cache pruning policy](https://clang.llvm.org/docs/ThinLTO.html#cache-pruning). + The default policy is set as: "prune_interval=1h:prune_expiration=0h:cache_size=75%:cache_size_bytes=30g:cache_size_files=0". + +Comgr supports some environment variables to aid in debugging.
These include: * `AMD_COMGR_SAVE_TEMPS`: If this is set, and is not "0", Comgr does not delete @@ -143,6 +163,20 @@ include: * `AMD_COMGR_TIME_STATISTICS`: If this is set, and is not "0", logs will include additional Comgr-specific timing information for compilation actions. +Comgr implements support for an in-memory, virtual filesystem (VFS) for storing +temporaries generated during intermediate compilation steps. This is aimed at +improving performance by reducing on-disk file I/O. Currently, VFS is only supported +for the device library link step, but we aim to progressively add support for +more actions. + +By default, VFS is turned on. + +* `AMD_COMGR_USE_VFS`: When set to "0", VFS support is turned off. +* Users may use the API `amd_comgr_action_info_set_vfs` to disable VFS for individual actions + without having to modify system-wide environment variables. +* If `AMD_COMGR_SAVE_TEMPS` is set and not "0", VFS support is turned off irrespective + of `AMD_COMGR_USE_VFS` or the use of `amd_comgr_action_info_set_vfs`. 
+ Versioning ---------- diff --git a/amd/comgr/VERSION.txt b/amd/comgr/VERSION.txt index 8fc8c2c51b6ba..785420ec0eec3 100644 --- a/amd/comgr/VERSION.txt +++ b/amd/comgr/VERSION.txt @@ -1,4 +1,4 @@ #COMGR_VERSION_MAJOR -2 +3 #COMGR_VERSION_MINOR -9 +0 diff --git a/amd/comgr/cmake/DeviceLibs.cmake b/amd/comgr/cmake/DeviceLibs.cmake index a07643bd3ae4a..ee811a7bf0654 100644 --- a/amd/comgr/cmake/DeviceLibs.cmake +++ b/amd/comgr/cmake/DeviceLibs.cmake @@ -59,6 +59,7 @@ foreach(AMDGCN_LIB_TARGET ${AMD_DEVICE_LIBS_TARGETS}) add_dependencies(amd_comgr ${AMDGCN_LIB_TARGET}_header) list(APPEND TARGETS_INCLUDES "#include \"${header}\"") + list(APPEND TARGETS_HEADERS "${INC_DIR}/${header}") endforeach() list(JOIN TARGETS_INCLUDES "\n" TARGETS_INCLUDES) @@ -110,4 +111,17 @@ list(APPEND TARGETS_DEFS "#undef AMD_DEVICE_LIBS_FUNCTION") list(JOIN TARGETS_DEFS "\n" TARGETS_DEFS) file(GENERATE OUTPUT ${GEN_LIBRARY_DEFS_INC_FILE} CONTENT "${TARGETS_DEFS}") +# compute the sha256 of the device libraries to detect changes and pass them to comgr (used by the cache) +find_package(Python3 REQUIRED Interpreter) +set(DEVICE_LIBS_ID_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/device-libs-id.py") +set(DEVICE_LIBS_ID_HEADER ${INC_DIR}/libraries_sha.inc) +add_custom_command(OUTPUT ${DEVICE_LIBS_ID_HEADER} + COMMAND ${Python3_EXECUTABLE} ${DEVICE_LIBS_ID_SCRIPT} --varname DEVICE_LIBS_ID --output ${DEVICE_LIBS_ID_HEADER} ${TARGETS_HEADERS} + DEPENDS ${DEVICE_LIBS_ID_SCRIPT} ${TARGETS_HEADERS} + COMMENT "Generating ${INC_DIR}/libraries_sha.inc" +) +set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${INC_DIR}/libraries_sha.inc) +add_custom_target(libraries_sha_header DEPENDS ${INC_DIR}/libraries_sha.inc) +add_dependencies(amd_comgr libraries_sha_header) + include_directories(${INC_DIR}) diff --git a/amd/comgr/cmake/device-libs-id.py b/amd/comgr/cmake/device-libs-id.py new file mode 100644 index 0000000000000..2a27d120dc637 --- /dev/null +++ b/amd/comgr/cmake/device-libs-id.py @@ -0,0 
+1,20 @@ +from argparse import ArgumentParser +from hashlib import sha256 +from functools import reduce + +if __name__ == "__main__": + parser = ArgumentParser(description='Generate id by computing a hash of the generated headers') + parser.add_argument("headers", nargs='+', help='List of headers to generate id from') + parser.add_argument("--varname", help='Name of the variable to generate', required=True) + parser.add_argument("--output", help='Name of the header to generate', required=True) + + args = parser.parse_args() + args.headers.sort() + + hash = sha256() + for x in args.headers: + hash.update(open(x, 'rb').read()) + digest_uchar = hash.digest() + digest_char = [e if e < 128 else e-256 for e in digest_uchar] + digest_elts = ", ".join(map(str, digest_char)) + print(f"static const char {args.varname}[] = {{{digest_elts}, 0}};", file=open(args.output, 'w')) diff --git a/amd/comgr/comgr-backward-compat.cmake b/amd/comgr/comgr-backward-compat.cmake deleted file mode 100644 index c4f5992940ad8..0000000000000 --- a/amd/comgr/comgr-backward-compat.cmake +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -cmake_minimum_required(VERSION 3.16.8) - -set(COMGR_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) -set(COMGR_WRAPPER_DIR ${COMGR_BUILD_DIR}/wrapper_dir) -set(COMGR_WRAPPER_INC_DIR ${COMGR_WRAPPER_DIR}/include) - -#Function to generate header template file -function(create_header_template) - file(WRITE ${COMGR_WRAPPER_DIR}/header.hpp.in "/* - Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the \"Software\"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- */ - -#ifndef @include_guard@ -#define @include_guard@ - -#ifndef ROCM_HEADER_WRAPPER_WERROR -#define ROCM_HEADER_WRAPPER_WERROR @deprecated_error@ -#endif -#if ROCM_HEADER_WRAPPER_WERROR /* ROCM_HEADER_WRAPPER_WERROR 1 */ -#error \"This file is deprecated. Use file from include path /opt/rocm-ver/include/ and prefix with amd_comgr\" -#else /* ROCM_HEADER_WRAPPER_WERROR 0 */ -#if defined(__GNUC__) -#warning \"This file is deprecated. Use file from include path /opt/rocm-ver/include/ and prefix with amd_comgr\" -#else -#pragma message(\"This file is deprecated. Use file from include path /opt/rocm-ver/include/ and prefix with amd_comgr\") -#endif -#endif /* ROCM_HEADER_WRAPPER_WERROR */ - -@include_statements@ - -#endif") -endfunction() - -#use header template file and generate wrapper header files -function(generate_wrapper_header) - file(MAKE_DIRECTORY ${COMGR_WRAPPER_INC_DIR}) - #find all header files(*.h) from include - file(GLOB include_files ${COMGR_BUILD_DIR}/include/*.h) - #Generate wrapper header files for each files in the list - foreach(header_file ${include_files}) - # set include guard - get_filename_component(INC_GAURD_NAME ${header_file} NAME_WE) - string(TOUPPER ${INC_GAURD_NAME} INC_GAURD_NAME) - set(include_guard "${include_guard}COMGR_WRAPPER_INCLUDE_${INC_GAURD_NAME}_H") - #set #include statement - get_filename_component(file_name ${header_file} NAME) - set(include_statements "${include_statements}#include \"${amd_comgr_NAME}/${file_name}\"\n") - configure_file(${COMGR_WRAPPER_DIR}/header.hpp.in ${COMGR_WRAPPER_INC_DIR}/${file_name}) - unset(include_statements) - unset(include_guard) - endforeach() - -endfunction() - -#Creater a template for header file -create_header_template() -#Use template header file and generater wrapper header files -generate_wrapper_header() -install(DIRECTORY ${COMGR_WRAPPER_INC_DIR} COMPONENT amd-comgr DESTINATION .) 
diff --git a/amd/comgr/docs/ReleaseNotes.md b/amd/comgr/docs/ReleaseNotes.md index d638838daaf53..434af96e64630 100644 --- a/amd/comgr/docs/ReleaseNotes.md +++ b/amd/comgr/docs/ReleaseNotes.md @@ -1,235 +1,67 @@ -Comgr v3.0 Release Notes +Comgr v4.0 Release Notes ======================== This document contains the release notes for the Code Object Manager (Comgr), -part of the ROCm Software Stack, release v3.0. Here we describe the status of +part of the ROCm Software Stack, release v4.0. Here we describe the status of Comgr, including major improvements from the previous release and new feature -These are in-progress notes for the upcoming Comgr v3.0 release. +These are in-progress notes for the upcoming Comgr v4.0 release. Release notes for previous releases can be found in [docs/historical](docs/historical). Potentially Breaking Changes ---------------------------- These changes are ones which we think may surprise users when upgrading to -Comgr v3.0 because of the opportunity they pose for disruption to existing +Comgr v4.0 because of the opportunity they pose for disruption to existing code bases. -- Removed -h option from comgr-objdump: The -h option (short for -headers) is a -legal comgr-objdump option. However registering this as an LLVM option by Comgr -prevents other LLVM tools or instances from registering a -h option in the same -process, which is an issue because -h is a common short form for -help. -- Updated default code object version used when linking code object specific -device library from v4 to v5 -- Updated shared library name on Windows 64-bit to include Comgr major version -(libamd\_comgr.dll -> libamd\_comgr\_X.dll, where X is the major version) -- oclc\_daz\_opt\_on.bc and oclc\_daz\_opt\_off.bc, and the corresponding - variable \_\_oclc\_daz\_opt are no longer necessary. -- Updated default device library linking behavior for several actions. 
- Previously, linking was done for some actions and not others, and not - controllable by the user. Now, linking is not done by default, but can - optionally be enabled via the - amd\_comgr\_action\_info\_set\_device\_lib\_linking() API. Users relying - on enabled-by-default behavior should update to use the new API to avoid - changes in behavior. - - Note: This does not apply to the \*COMPILE\_SOURCE\_WITH\_DEVICE\_LIBS\_TO\_BC - action. This action is not affected by the - amd\_comgr\_action\_info\_set\_device\_lib\_linking() API. The new API will - allow us to deprecate and remove this action in favor of the - \*COMPILE\_SOURCE\_TO\_BC action. New Features ------------ -- Added support for linking code\_object\_v4/5 device library files. -- Enabled llvm dylib builds. When llvm dylibs are enabled, a new package -rocm-llvm-core will contain the required dylibs for Comgr. -- Moved build to C++17, allowing us to use more modern features in the -implementation and tests. -- Enabled thread-safe execution of Comgr by enclosing primary Comgr actions in -an std::scoped\_lock() -- Added support for bitcode and archive unbundling during linking via the new -llvm OffloadBundler API. -- Added support for code object v6 and generic targets. -- Added mechanism to bypass device library file system writes if Comgr is able -to locate a local device library directory via the clang-resource-dir +- Added a Comgr Caching infrastructure, currently covering the following +behaviors: + - caching unbundling of compressed clang offload bundles + - caching SPIR-V to LLVM IR translations + - caching clang driver invocations + More information about the Comgr Caching infrastructure and how to use it can + be found in amd/comgr/README.md. +- Updated the license used for Comgr from Illinois to Apache 2.0 with LLVM +Extensions (the same license used by LLVM). +- Added Image Support to Comgr's handling of ISA metadata. Support for images +can now be queried with Comgr's metadata APIs. 
+- Added support for linking device library files through the use of a Virtual +File System (VFS). Bug Fixes --------- -- Fixed symbolizer assertion for non-null terminated file-slice content, -by bypassing null-termination check in llvm::MemoryBuffer -- Fixed bug and add error checking for internal unbundling. Previously internal -unbundler would fail if files weren't already present in filesystem. -- Fixed issue where lookUpCodeObject() would fail if code object ISA strings -weren't listed in order. -- Added support for subdirectories in amd\_comgr\_set\_data\_name(). Previously -names with a "/" would generate a file-not-found error. -- Added amdgpu-internalize-symbols option to bitcode codegen action, which has -significant performance implications -- Fixed an issue where -nogpulib was always included in HIP compilations, which -prevented correct execution of -COMPILE\_SOURCE\_WITH\_DEVICE\_LIBS\_TO\_BC action. -- Fixed a multi-threading bug where programs would hang when calling Comgr APIs -like amd\_comgr\_iterate\_symbols() from multiple threads -- Fixed an issue where providing DataObjects with an empty name to the bitcode -linking action caused errors when AMD\_COMGR\_SAVE\_TEMPS was enabled, or when -linking bitcode bundles. -- Updated to use lld::lldMain() introduced in D110949 instead of the older -lld::elf::link in Comgr's linkWithLLD() -- Added -x assembler option to assembly compilation. Before, if an assembly file -did not end with a .s file extension, it was not handled properly by the Comgr -ASSEMBLE\_SOURCE\_TO\_RELOCATABLE action. -- Switched getline() from C++ to C-style to avoid issues with stdlibc++ and -pytorch -- Added new -relink-builtin-bitcode-postop LLVM option to device library. This -fixes an issue with the \*COMPILE\_SOURCE\_WITH\_DEVICE\_LIBRARIES\_TO\_BC where -OpenCL applications that leveraged AMDGPUSimplifyLibCalls optimizations would -need to re-link bitcodes separately to avoid errors at runtime. 
-- Correctly set directory to object file path when forwarding -save-temps for -HIP compilations with AMD\_COMGR\_SAVE\_TEMPS set -- Added new ['--skip-line-zero'](https://github.com/llvm/llvm-project/pull/82240) -LLVM option by default in comgr-symbolizer to support symbolization of instructions -having no source correspondence in the debug information. New APIs -------- -- amd\_comgr\_populate\_mangled\_names() (v2.5) -- amd\_comgr\_get\_mangled\_name() (v2.5) - - Support bitcode and executable name lowering. The first call populates a - list of mangled names for a given data object, while the second fetches a - name from a given object and index. -- amd\_comgr\_populate\_name\_expression\_map() (v2.6) -- amd\_comgr\_map\_name\_expression\_to\_symbol\_name() (v2.6) - - Support bitcode and code object name expression mapping. The first call - populates a map of name expressions for a given comgr data object, using - LLVM APIs to traverse the bitcode or code object. The second call returns - a value (mangled symbol name) from the map for a given key (unmangled - name expression). These calls assume that names of interest have been - enclosed the HIP runtime using a stub attribute containg the following - string in the name: "__amdgcn_name_expr". -- amd\_comgr\_map\_elf\_virtual\_address\_to\_code\_object\_offset() (v2.7) - - For a given executable and ELF virtual address, return a code object - offset. This API will benifet the ROCm debugger and profilier -- amd\_comgr\_action\_info\_set\_bundle\_entry\_ids() (v2.8) -- amd\_comgr\_action\_info\_get\_bundle\_entry\_id\_count() (v2.8) -- amd\_comgr\_action\_info\_get\_bundle\_entry\_id() (v2.8) - - A user can provide a set of bundle entry IDs, which are processed when - calling the AMD\_COMGR\_UNBUNDLE action -- amd\_comgr\_action\_info\_set\_device\_lib\_linking() (v2.9) - - By setting this ActionInfo property, a user can explicitly dictate if - device libraries should be linked for a given action. 
(Previouly, the - action type implicitly determined device library linking). - +- amd\_comgr\_action\_info\_set\_vfs() (v3.1) + - By setting this ActionInfo property, users can explicitly dictate if + device libraries should be linked using the real file system or a + Virtual File System (VFS). Deprecated APIs --------------- Removed APIs ------------ -- amd\_comgr\_action\_info\_set\_options() (v3.0) -- amd\_comgr\_action\_info\_get\_options() (v3.0) - - Use amd\_comgr\_action\_info\_set\_option\_list(), - amd\_comgr\_action\_info\_get\_option\_list\_count(), and - amd\_comgr\_action\_info\_get\_option\_list\_item() instead New Comgr Actions and Data Types -------------------------------- -- (Action) AMD\_COMGR\_ACTION\_COMPILE\_SOURCE\_TO\_RELOCATABLE - - This action performs compile-to-bitcode, linking device libraries, and -codegen-to-relocatable in a single step. By doing so, clients are able to defer more -of the flag handling to toolchain. Currently only supports HIP. -- (Data Type) AMD\_COMGR\_DATA\_KIND\_BC\_BUNDLE -- (Data Type) AMD\_COMGR\_DATA\_KIND\_AR\_BUNDLE - - These data kinds can now be passed to an AMD\_COMGR\_ACTION\_LINK\_BC\_TO\_BC -action, and Comgr will internally unbundle and link via the OffloadBundler and linkInModule APIs. -- (Language Type) AMD\_COMGR\_LANGUAGE\_LLVM\_IR - - This language can now be passed to AMD\_COMGR\_ACTION\_COMPILE\_\* actions - to enable compilation of LLVM IR (.ll or .bc) files. This is useful for MLIR - contexts. -- (Action) AMD\_COMGR\_ACTION\_COMPILE\_SOURCE\_TO\_EXECUTABLE - - This action allows compilation from source directly to executable, including - linking device libraries. -- (Action) AMD\_COMGR\_ACTION\_UNBUNDLE - - This accepts a set of bitcode bundles, object file bundles, and archive - bundles,and returns set of unbundled bitcode, object files, and archives, - selecting bundles based on the bundle entry IDs provided.
-- (Data Type) AMD\_COMGR\_DATA\_KIND\_OBJ\_BUNDLE - - This data kind represents a clang-offload-bundle of object files, and can be - passed when calling the AMD\_COMGR\_ACTION\_UNBUNDLE action -- (Data Type) AMD\_COMGR\_DATA\_KIND\_SPIRV - - This data kind represents a SPIR-V binary file (.spv) -- (Action) AMD\_COMGR\_ACTION\_TRANSLATE\_SPIRV\_TO\_BC - - This accepts a set of SPIR-V (.spv) inputs, and returns a set of translated - bitcode (.bc) outputs Deprecated Comgr Actions and Data Types --------------------------------------- Removed Comgr Actions and Data Types ------------------------------------ -- (Action) AMD\_COMGR\_ACTION\_COMPILE\_SOURCE\_TO\_FATBIN - - This workaround has been removed in favor of - \*\_COMPILE\_SOURCE\_(WITH\_DEVICE\_LIBS\_)TO\_BC -- (Action) AMD\_COMGR\_ACTION\_OPTIMIZE\_BC\_TO\_BC - - This is a legacy action that was never implemented -- (Language) AMD\_COMGR\_LANGUAGE\_HC - - This is a legacy language that was never used -- (Action) AMD\_COMGR\_ACTION\_ADD\_DEVICE\_LIBRARIES - - This has been replaced with - AMD\_COMGR\_ACTION\_COMPILE\_SOURCE\_WITH\_DEVICE\_LIBS\_TO\_BC Comgr Testing, Debugging, and Logging Updates --------------------------------------------- -- Added support for C++ tests. Although Comgr APIs are C-compatible, we can now -use C++ features in testing (C++ threading APIs, etc.) -- Clean up test directory by moving sources to subdirectory -- Several tests updated to pass while verbose logs are redirected to stdout -- Log information reported when AMD\_COMGR\_EMIT\_VERBOSE\_LOGS updated to: - - Show both user-facing clang options used (Compilation Args) and internal - driver options (Driver Job Args) - - Show files linked by linkBitcodeToBitcode() -- Remove support for code object v2 compilation in tests and test CMAKE due to -deprecation of code object v2 in LLVM. However, we still test loading and -metadata querys for code object v2 objects. 
-- Remove support for code object v3 compilation in tests and test CMAKE due to -deprecation of code object v3 in LLVM. However, we still test loading and -metadata querys for code object v3 objects. -- Revamp symbolizer test to fail on errors, among other improvments -- Improve linking and unbundling log to correctly store temporary files in /tmp, -and to output clang-offload-bundler command to allow users to re-create Comgr -unbundling. -- Add git branch and commit hash for Comgr, and commit hash for LLVM to log -output for Comgr actions. This can help us debug issues more quickly in cases -where reporters provide Comgr logs. -- Fix multiple bugs with mangled names test -- Update default arch for test binaries from gfx830 to gfx900 -- Refactor nested kernel behavior into new test, as this behavior is less common -and shouldn't be featured in the baseline tests -- Add metadata parsing tests for code objects with multiple AMDGPU metadata note entries. -- Updated Comgr HIP test to not rely on HIP\_COMPILER being set, or a valid HIP -installation. We can test the functionality of Comgr HIP compilation without -directly relying on HIP -- Added framework for Comgr lit tests. These tests will allow us to easily -validate generated artifacts with command-line tools like llvm-dis, -llvm-objdump, etc. Moving forward, most new Comgr tests should be written as -lit tests, and tests in comgr/test should be transitioned to comgr/test-lit. 
New Targets ----------- - - gfx940 - - gfx941 - - gfx942 - - gfx950 - - gfx1036 - - gfx1150 - - gfx1151 - - gfx1152 - - gfx9-generic - - gfx9-4-generic - - gfx10-1-generic - - gfx10-3-generic - - gfx11-generic - - gfx12-generic Removed Targets --------------- diff --git a/amd/comgr/docs/historical/ReleaseNotes-ComgrV3.md b/amd/comgr/docs/historical/ReleaseNotes-ComgrV3.md new file mode 100644 index 0000000000000..d089cd2699f2a --- /dev/null +++ b/amd/comgr/docs/historical/ReleaseNotes-ComgrV3.md @@ -0,0 +1,243 @@ +Comgr v3.0 Release Notes +======================== + +This document contains the release notes for the Code Object Manager (Comgr), +part of the ROCm Software Stack, release v3.0. Here we describe the status of +Comgr, including major improvements from the previous release and new feature + +These are in-progress notes for the upcoming Comgr v3.0 release. +Release notes for previous releases can be found in +[docs/historical](docs/historical). + +Potentially Breaking Changes +---------------------------- +These changes are ones which we think may surprise users when upgrading to +Comgr v3.0 because of the opportunity they pose for disruption to existing +code bases. + +- Removed -h option from comgr-objdump: The -h option (short for -headers) is a +legal comgr-objdump option. However registering this as an LLVM option by Comgr +prevents other LLVM tools or instances from registering a -h option in the same +process, which is an issue because -h is a common short form for -help. +- Updated default code object version used when linking code object specific +device library from v4 to v5 +- Updated shared library name on Windows 64-bit to include Comgr major version +(libamd\_comgr.dll -> libamd\_comgr\_X.dll, where X is the major version) +- oclc\_daz\_opt\_on.bc and oclc\_daz\_opt\_off.bc, and the corresponding + variable \_\_oclc\_daz\_opt are no longer necessary. +- Updated default device library linking behavior for several actions. 
+ Previously, linking was done for some actions and not others, and not + controllable by the user. Now, linking is not done by default, but can + optionally be enabled via the + amd\_comgr\_action\_info\_set\_device\_lib\_linking() API. Users relying + on enabled-by-default behavior should update to use the new API to avoid + changes in behavior. + + Note: This does not apply to the \*COMPILE\_SOURCE\_WITH\_DEVICE\_LIBS\_TO\_BC + action. This action is not affected by the + amd\_comgr\_action\_info\_set\_device\_lib\_linking() API. The new API will + allow us to deprecate and remove this action in favor of the + \*COMPILE\_SOURCE\_TO\_BC action. + +New Features +------------ +- Added support for linking code\_object\_v4/5 device library files. +- Enabled llvm dylib builds. When llvm dylibs are enabled, a new package +rocm-llvm-core will contain the required dylibs for Comgr. +- Moved build to C++17, allowing us to use more modern features in the +implementation and tests. +- Enabled thread-safe execution of Comgr by enclosing primary Comgr actions in +an std::scoped\_lock() +- Added support for bitcode and archive unbundling during linking via the new +llvm OffloadBundler API. +- Added support for code object v6 and generic targets. +- Added mechanism to bypass device library file system writes if Comgr is able +to locate a local device library directory via the clang-resource-dir + +Bug Fixes +--------- +- Fixed symbolizer assertion for non-null terminated file-slice content, +by bypassing null-termination check in llvm::MemoryBuffer +- Fixed bug and add error checking for internal unbundling. Previously internal +unbundler would fail if files weren't already present in filesystem. +- Fixed issue where lookUpCodeObject() would fail if code object ISA strings +weren't listed in order. +- Added support for subdirectories in amd\_comgr\_set\_data\_name(). Previously +names with a "/" would generate a file-not-found error. 
+- Added amdgpu-internalize-symbols option to bitcode codegen action, which has +significant performance implications +- Fixed an issue where -nogpulib was always included in HIP compilations, which +prevented correct execution of +COMPILE\_SOURCE\_WITH\_DEVICE\_LIBS\_TO\_BC action. +- Fixed a multi-threading bug where programs would hang when calling Comgr APIs +like amd\_comgr\_iterate\_symbols() from multiple threads +- Fixed an issue where providing DataObjects with an empty name to the bitcode +linking action caused errors when AMD\_COMGR\_SAVE\_TEMPS was enabled, or when +linking bitcode bundles. +- Updated to use lld::lldMain() introduced in D110949 instead of the older +lld::elf::link in Comgr's linkWithLLD() +- Added -x assembler option to assembly compilation. Before, if an assembly file +did not end with a .s file extension, it was not handled properly by the Comgr +ASSEMBLE\_SOURCE\_TO\_RELOCATABLE action. +- Switched getline() from C++ to C-style to avoid issues with stdlibc++ and +pytorch +- Added new -relink-builtin-bitcode-postop LLVM option to device library. This +fixes an issue with the \*COMPILE\_SOURCE\_WITH\_DEVICE\_LIBRARIES\_TO\_BC where +OpenCL applications that leveraged AMDGPUSimplifyLibCalls optimizations would +need to re-link bitcodes separately to avoid errors at runtime. +- Correctly set directory to object file path when forwarding -save-temps for +HIP compilations with AMD\_COMGR\_SAVE\_TEMPS set +- Added new ['--skip-line-zero'](https://github.com/llvm/llvm-project/pull/82240) +LLVM option by default in comgr-symbolizer to support symbolization of instructions +having no source correspondence in the debug information. + +New APIs +-------- +- amd\_comgr\_populate\_mangled\_names() (v2.5) +- amd\_comgr\_get\_mangled\_name() (v2.5) + - Support bitcode and executable name lowering. The first call populates a + list of mangled names for a given data object, while the second fetches a + name from a given object and index. 
+- amd\_comgr\_populate\_name\_expression\_map() (v2.6) +- amd\_comgr\_map\_name\_expression\_to\_symbol\_name() (v2.6) + - Support bitcode and code object name expression mapping. The first call + populates a map of name expressions for a given comgr data object, using + LLVM APIs to traverse the bitcode or code object. The second call returns + a value (mangled symbol name) from the map for a given key (unmangled + name expression). These calls assume that names of interest have been + enclosed by the HIP runtime using a stub attribute containing the following + string in the name: "__amdgcn_name_expr". +- amd\_comgr\_map\_elf\_virtual\_address\_to\_code\_object\_offset() (v2.7) + - For a given executable and ELF virtual address, return a code object + offset. This API will benefit the ROCm debugger and profiler +- amd\_comgr\_action\_info\_set\_bundle\_entry\_ids() (v2.8) +- amd\_comgr\_action\_info\_get\_bundle\_entry\_id\_count() (v2.8) +- amd\_comgr\_action\_info\_get\_bundle\_entry\_id() (v2.8) + - A user can provide a set of bundle entry IDs, which are processed when + calling the AMD\_COMGR\_UNBUNDLE action +- amd\_comgr\_action\_info\_set\_device\_lib\_linking() (v2.9) + - By setting this ActionInfo property, a user can explicitly dictate if + device libraries should be linked for a given action. (Previously, the + action type implicitly determined device library linking). + + +Deprecated APIs +--------------- + +Removed APIs +------------ +- amd\_comgr\_action\_info\_set\_options() (v3.0) +- amd\_comgr\_action\_info\_get\_options() (v3.0) + - Use amd\_comgr\_action\_info\_set\_option\_list(), + amd\_comgr\_action\_info\_get\_option\_list\_count(), and + amd\_comgr\_action\_info\_get\_option\_list\_item() instead + +New Comgr Actions and Data Types +-------------------------------- +- (Action) AMD\_COMGR\_ACTION\_COMPILE\_SOURCE\_TO\_RELOCATABLE + - This action performs compile-to-bitcode, linking device libraries, and +codegen-to-relocatable in a single step.
By doing so, clients are able to defer more +of the flag handling to toolchain. Currently only supports HIP. +- (Data Type) AMD\_COMGR\_DATA\_KIND\_BC\_BUNDLE +- (Data Type) AMD\_COMGR\_DATA\_KIND\_AR\_BUNDLE + - These data kinds can now be passed to an AMD\_COMGR\_ACTION\_LINK\_BC\_TO\_BC +action, and Comgr will internally unbundle and link via the OffloadBundler and linkInModule APIs. +- (Language Type) AMD\_COMGR\_LANGUAGE\_LLVM\_IR + - This language can now be passed to AMD\_COMGR\_ACTION\_COMPILE\_\* actions + to enable compilation of LLVM IR (.ll or .bc) files. This is useful for MLIR + contexts. +- (Action) AMD\_COMGR\_ACTION\_COMPILE\_SOURCE\_TO\_EXECUTABLE + - This action allows compilation from source directly to executable, including + linking device libraries. +- (Action) AMD\_COMGR\_ACTION\_UNBUNDLE + - This accepts a set of bitcode bundles, object file bundles, and archive + bundles, and returns a set of unbundled bitcode, object files, and archives, + selecting bundles based on the bundle entry IDs provided.
+- (Data Type) AMD\_COMGR\_DATA\_KIND\_OBJ\_BUNDLE + - This data kind represents a clang-offload-bundle of object files, and can be + passed when calling the AMD\_COMGR\_ACTION\_UNBUNDLE action +- (Data Type) AMD\_COMGR\_DATA\_KIND\_SPIRV + - This data kind represents a SPIR-V binary file (.spv) +- (Action) AMD\_COMGR\_ACTION\_TRANSLATE\_SPIRV\_TO\_BC + - This accepts a set of SPIR-V (.spv) inputs, and returns a set of translated + bitcode (.bc) outputs + +Deprecated Comgr Actions and Data Types +--------------------------------------- + +Removed Comgr Actions and Data Types +------------------------------------ +- (Action) AMD\_COMGR\_ACTION\_COMPILE\_SOURCE\_TO\_FATBIN + - This workaround has been removed in favor of + \*\_COMPILE\_SOURCE\_(WITH\_DEVICE\_LIBS\_)TO\_BC +- (Action) AMD\_COMGR\_ACTION\_OPTIMIZE\_BC\_TO\_BC + - This is a legacy action that was never implemented +- (Language) AMD\_COMGR\_LANGUAGE\_HC + - This is a legacy language that was never used +- (Action) AMD\_COMGR\_ACTION\_ADD\_DEVICE\_LIBRARIES + - This has been replaced with + AMD\_COMGR\_ACTION\_COMPILE\_SOURCE\_WITH\_DEVICE\_LIBS\_TO\_BC + +Comgr Testing, Debugging, and Logging Updates +--------------------------------------------- +- Added support for C++ tests. Although Comgr APIs are C-compatible, we can now +use C++ features in testing (C++ threading APIs, etc.) +- Clean up test directory by moving sources to subdirectory +- Several tests updated to pass while verbose logs are redirected to stdout +- Log information reported when AMD\_COMGR\_EMIT\_VERBOSE\_LOGS updated to: + - Show both user-facing clang options used (Compilation Args) and internal + driver options (Driver Job Args) + - Show files linked by linkBitcodeToBitcode() +- Remove support for code object v2 compilation in tests and test CMAKE due to +deprecation of code object v2 in LLVM. However, we still test loading and +metadata querys for code object v2 objects. 
+- Remove support for code object v3 compilation in tests and test CMAKE due to +deprecation of code object v3 in LLVM. However, we still test loading and +metadata queries for code object v3 objects. +- Revamp symbolizer test to fail on errors, among other improvements +- Improve linking and unbundling log to correctly store temporary files in /tmp, +and to output clang-offload-bundler command to allow users to re-create Comgr +unbundling. +- Add git branch and commit hash for Comgr, and commit hash for LLVM to log +output for Comgr actions. This can help us debug issues more quickly in cases +where reporters provide Comgr logs. +- Fix multiple bugs with mangled names test +- Update default arch for test binaries from gfx830 to gfx900 +- Refactor nested kernel behavior into new test, as this behavior is less common +and shouldn't be featured in the baseline tests +- Add metadata parsing tests for code objects with multiple AMDGPU metadata note entries. +- Updated Comgr HIP test to not rely on HIP\_COMPILER being set, or a valid HIP +installation. We can test the functionality of Comgr HIP compilation without +directly relying on HIP +- Added framework for Comgr lit tests. These tests will allow us to easily +validate generated artifacts with command-line tools like llvm-dis, +llvm-objdump, etc. Moving forward, most new Comgr tests should be written as +lit tests, and tests in comgr/test should be transitioned to comgr/test-lit. + +New Targets +----------- + - gfx940 + - gfx941 + - gfx942 + - gfx1036 + - gfx1150 + - gfx1151 + - gfx1152 + - gfx9-generic + - gfx9-4-generic + - gfx10-1-generic + - gfx10-3-generic + - gfx11-generic + - gfx12-generic + +Removed Targets +--------------- + +Significant Known Problems +-------------------------- +- Several Comgr actions currently write and read files from the filesystem, +which is a known performance issue.
We aim to address this by improving +clang's virtual file system support +- Several Comgr actions currently fork new processes for compilation actions. We +aim to address this by librarizing llvm tools that are currently only usable as +a separate process. diff --git a/amd/comgr/include/amd_comgr.h.in b/amd/comgr/include/amd_comgr.h.in index 7e0f69acd62f6..e8774092ec693 100644 --- a/amd/comgr/include/amd_comgr.h.in +++ b/amd/comgr/include/amd_comgr.h.in @@ -234,6 +234,18 @@ extern "C" { */ #define AMD_COMGR_VERSION_2_9 +/** + * The function was introduced or changed in version 3.0 of the interface + * and has the symbol version string of ``"@amd_comgr_NAME@_3.0"``. + */ +#define AMD_COMGR_VERSION_3_0 + +/** + * The function was introduced or changed in version 3.1 of the interface + * and has the symbol version string of ``"@amd_comgr_NAME@_3.1"``. + */ +#define AMD_COMGR_VERSION_3_1 + /** @} */ /** @@ -1421,6 +1433,41 @@ amd_comgr_action_info_get_bundle_entry_id( size_t *size, char *bundle_entry_id) AMD_COMGR_VERSION_2_8; +/** + * @brief Set whether the specified action should use an + * in-memory virtual file system (VFS). + * + * @warning Environment variable @p AMD_COMGR_SAVE_TEMPS may override options + * set by this API and @p AMD_COMGR_USE_VFS. If @p AMD_COMGR_SAVE_TEMPS is set + * to "1", all actions are performed using the real file system irrespective of + * the values of @p should_use_vfs and @p AMD_COMGR_USE_VFS. + * + * @warning Environment variable @p AMD_COMGR_USE_VFS may override options + * set by this API. If @p AMD_COMGR_USE_VFS is set to "1", all actions + * are performed using VFS. If @p AMD_COMGR_USE_VFS is set to "0", + * none of the actions are performed using VFS. + * + * If @p AMD_COMGR_USE_VFS is unset, this API can be used to selectively + * turn VFS usage on/off for specified actions. + * + * @param[in] action_info A handle to the action info object to be + * updated.
+ * + * @param[in] should_use_vfs A boolean that directs the choice to + * use the VFS. + * + * @retval ::AMD_COMGR_STATUS_SUCCESS The function has + * been executed successfully. + * + * @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + * action_info is an invalid action info object. + * + */ +amd_comgr_status_t AMD_COMGR_API +amd_comgr_action_info_set_vfs( + amd_comgr_action_info_t action_info, + bool should_use_vfs) AMD_COMGR_VERSION_3_1; + /** * @brief Set the device library linking behavior of an action info object. * @@ -1776,6 +1823,23 @@ typedef enum amd_comgr_action_kind_s { */ AMD_COMGR_ACTION_UNBUNDLE = 0xF, + /** + * Compile each source SPIR-V object in @p input into a relocatable. + * For each successful compilation, add a relocatable object to @p result + * + * We accomplish this by first translating the .spv files to .bc via the + * SPIR-V translator. We then extract any SPIR-V flags from the embedded + * @llvm.cmdline variable. Finally, we compile the bitcode to a relocatable, + * appending any extracted flags. + * + * Return @p AMD_COMGR_STATUS_ERROR if any translation, flag extraction, or + * compilation fails. + * + * Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + * if any input is not SPIR-V. + */ + AMD_COMGR_ACTION_COMPILE_SPIRV_TO_RELOCATABLE = 0x10, + /** * Translate each source SPIR-V object in @p input into LLVM IR Bitcode. 
* For each successful translation, add a bc object to @p result * diff --git a/amd/comgr/src/amdcomgr.def b/amd/comgr/src/amdcomgr.def index 15dffb9324dc3..27b04dd8270f7 100644 --- a/amd/comgr/src/amdcomgr.def +++ b/amd/comgr/src/amdcomgr.def @@ -48,3 +48,4 @@ amd_comgr_action_info_set_bundle_entry_ids amd_comgr_action_info_get_bundle_entry_id_count amd_comgr_action_info_get_bundle_entry_id amd_comgr_action_info_set_device_lib_linking +amd_comgr_action_info_set_vfs diff --git a/amd/comgr/src/comgr-cache-bundler-command.cpp b/amd/comgr/src/comgr-cache-bundler-command.cpp new file mode 100644 index 0000000000000..5142627254d64 --- /dev/null +++ b/amd/comgr/src/comgr-cache-bundler-command.cpp @@ -0,0 +1,181 @@ +/******************************************************************************* + * + * University of Illinois/NCSA + * Open Source License + * + * Copyright (c) 2003-2017 University of Illinois at Urbana-Champaign. + * Modifications (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. 
+ * + * * Neither the names of the LLVM Team, University of Illinois at + * Urbana-Champaign, nor the names of its contributors may be used to + * endorse or promote products derived from this Software without specific + * prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + ******************************************************************************/ + +#include + +#include +#include + +namespace COMGR { +using namespace llvm; +using namespace clang; + +using SizeFieldType = uint32_t; + +bool UnbundleCommand::canCache() const { + // The header format for AR files is not the same as object files + if (Kind == AMD_COMGR_DATA_KIND_AR_BUNDLE) + return false; + + StringRef InputFilename = Config.InputFileNames.front(); + file_magic Magic; + if (identify_magic(InputFilename, Magic)) + return false; + + // Check the input file magic. 
Handle only compressed bundles + // It's not worth to cache other types of bundles + return Magic == file_magic::offload_bundle_compressed; +} + +Error UnbundleCommand::writeExecuteOutput(StringRef CachedBuffer) { + for (StringRef OutputFilename : Config.OutputFileNames) { + SizeFieldType OutputFileSize; + if (CachedBuffer.size() < sizeof(OutputFileSize)) + return createStringError(std::errc::invalid_argument, + "Not enough bytes to read output file size"); + memcpy(&OutputFileSize, CachedBuffer.data(), sizeof(OutputFileSize)); + CachedBuffer = CachedBuffer.drop_front(sizeof(OutputFileSize)); + + if (CachedBuffer.size() < OutputFileSize) + return createStringError(std::errc::invalid_argument, + "Not enough bytes to read output file contents"); + + StringRef OutputFileContents = CachedBuffer.substr(0, OutputFileSize); + CachedBuffer = CachedBuffer.drop_front(OutputFileSize); + + if (Error Err = CachedCommandAdaptor::writeUniqueExecuteOutput( + OutputFilename, OutputFileContents)) + return Err; + } + + if (!CachedBuffer.empty()) + return createStringError(std::errc::invalid_argument, + "Bytes in cache entry not used for the output"); + return Error::success(); +} + +Expected UnbundleCommand::readExecuteOutput() { + size_t OutputSize = 0; + for (StringRef OutputFilename : Config.OutputFileNames) { + auto MaybeOneOutput = + CachedCommandAdaptor::readUniqueExecuteOutput(OutputFilename); + if (!MaybeOneOutput) + return MaybeOneOutput.takeError(); + + const MemoryBuffer &OneOutputBuffer = **MaybeOneOutput; + SizeFieldType OneOutputFileSize = OneOutputBuffer.getBufferSize(); + + OutputBuffer.resize_for_overwrite(OutputSize + sizeof(OneOutputFileSize) + + OneOutputFileSize); + + memcpy(OutputBuffer.data() + OutputSize, &OneOutputFileSize, + sizeof(OneOutputFileSize)); + OutputSize += sizeof(OneOutputFileSize); + memcpy(OutputBuffer.data() + OutputSize, OneOutputBuffer.getBufferStart(), + OneOutputFileSize); + OutputSize += OneOutputFileSize; + } + return OutputBuffer; +} + 
+amd_comgr_status_t UnbundleCommand::execute(raw_ostream &LogS) { + assert(Config.InputFileNames.size() == 1); + + OffloadBundler Bundler(Config); + + switch (Kind) { + case AMD_COMGR_DATA_KIND_BC_BUNDLE: + case AMD_COMGR_DATA_KIND_OBJ_BUNDLE: { + if (Error Err = Bundler.UnbundleFiles()) { + logAllUnhandledErrors(std::move(Err), LogS, "Unbundle Error: "); + return AMD_COMGR_STATUS_ERROR; + } + break; + } + case AMD_COMGR_DATA_KIND_AR_BUNDLE: { + if (Error Err = Bundler.UnbundleArchive()) { + logAllUnhandledErrors(std::move(Err), LogS, "Unbundle Archives Error: "); + return AMD_COMGR_STATUS_ERROR; + } + break; + } + default: + llvm_unreachable("invalid bundle type"); + } + + return AMD_COMGR_STATUS_SUCCESS; +} + +CachedCommandAdaptor::ActionClass UnbundleCommand::getClass() const { + return clang::driver::Action::OffloadUnbundlingJobClass; +} + +void UnbundleCommand::addOptionsIdentifier(HashAlgorithm &H) const { + H.update(Config.TargetNames.size()); + for (StringRef Target : Config.TargetNames) { + CachedCommandAdaptor::addString(H, Target); + } +} + +Error UnbundleCommand::addInputIdentifier(HashAlgorithm &H) const { + StringRef InputFilename = Config.InputFileNames.front(); + + constexpr size_t LargestHeaderSize = CompressedOffloadBundle::V3HeaderSize; + + ErrorOr> MaybeInputBuffer = + MemoryBuffer::getFileSlice(InputFilename, LargestHeaderSize, 0); + if (!MaybeInputBuffer) { + std::error_code EC = MaybeInputBuffer.getError(); + return createStringError(EC, Twine("Failed to open ") + InputFilename + + " : " + EC.message() + "\n"); + } + + MemoryBuffer &InputBuffer = **MaybeInputBuffer; + + uint8_t Header[LargestHeaderSize]; + memset(Header, 0, sizeof(Header)); + memcpy(Header, InputBuffer.getBufferStart(), + std::min(LargestHeaderSize, InputBuffer.getBufferSize())); + + // only hash the input file, not the whole header. 
Colissions are unlikely + // since the header includes a hash (weak) of the contents + H.update(Header); + return Error::success(); +} + +} // namespace COMGR diff --git a/amd/comgr/src/comgr-cache-bundler-command.h b/amd/comgr/src/comgr-cache-bundler-command.h new file mode 100644 index 0000000000000..cb9735dda2082 --- /dev/null +++ b/amd/comgr/src/comgr-cache-bundler-command.h @@ -0,0 +1,74 @@ +/******************************************************************************* + * + * University of Illinois/NCSA + * Open Source License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the names of Advanced Micro Devices, Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this Software without specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + ******************************************************************************/ + +#ifndef COMGR_CACHE_BUNDLER_COMMAND_H +#define COMGR_CACHE_BUNDLER_COMMAND_H + +#include + +namespace clang { +class OffloadBundlerConfig; +} // namespace clang + +namespace COMGR { +class UnbundleCommand final : public CachedCommandAdaptor { +private: + amd_comgr_data_kind_t Kind; + const clang::OffloadBundlerConfig &Config; + + // To avoid copies, store the output of execute, such that readExecuteOutput + // can return a reference. + llvm::SmallString<64> OutputBuffer; + +public: + UnbundleCommand(amd_comgr_data_kind_t Kind, + const clang::OffloadBundlerConfig &Config) + : Kind(Kind), Config(Config) {} + + bool canCache() const override; + llvm::Error writeExecuteOutput(llvm::StringRef CachedBuffer) override; + llvm::Expected readExecuteOutput() override; + amd_comgr_status_t execute(llvm::raw_ostream &LogS) override; + + ~UnbundleCommand() override = default; + +protected: + ActionClass getClass() const override; + void addOptionsIdentifier(HashAlgorithm &) const override; + llvm::Error addInputIdentifier(HashAlgorithm &) const override; +}; +} // namespace COMGR + +#endif diff --git a/amd/comgr/src/comgr-cache-command.cpp b/amd/comgr/src/comgr-cache-command.cpp new file mode 100644 index 0000000000000..ee8a300547710 --- /dev/null +++ b/amd/comgr/src/comgr-cache-command.cpp @@ -0,0 +1,136 @@ +#include "comgr-cache-command.h" +#include "comgr-cache.h" +#include "comgr-device-libs.h" +#include "comgr-env.h" +#include "comgr.h" + +#include +#include +#include +#include + +#include + +namespace COMGR { +using namespace llvm; +using namespace clang; + +namespace { +// std::isalnum is locale dependent 
and can have issues +// depending on the stdlib version and application. We prefer to avoid it +bool isalnum(char c) { + char low[] = {'0', 'a', 'A'}; + char hi[] = {'9', 'z', 'Z'}; + for (unsigned i = 0; i != 3; ++i) { + if (low[i] <= c && c <= hi[i]) + return true; + } + return false; +} +} // namespace + +std::optional CachedCommandAdaptor::searchComgrTmpModel(StringRef S) { + // Ideally, we would use std::regex_search with the regex + // "comgr-[[:alnum:]]{6}". However, due to a bug in stdlibc++ + // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85824) we have to roll our + // own search of this regular expression. This bug resulted in a crash in + // luxmarkv3, during the std::regex constructor. + const StringRef Prefix = "comgr-"; + const size_t AlnumCount = 6; + + size_t N = S.size(); + size_t Pos = S.find(Prefix); + + size_t AlnumStart = Pos + Prefix.size(); + size_t AlnumEnd = AlnumStart + AlnumCount; + if (Pos == StringRef::npos || N < AlnumEnd) + return std::nullopt; + + for (size_t i = AlnumStart; i < AlnumEnd; ++i) { + if (!isalnum(S[i])) + return std::nullopt; + } + + return Pos; +} + +void CachedCommandAdaptor::addString(CachedCommandAdaptor::HashAlgorithm &H, + StringRef S) { + // hash size + contents to avoid collisions + // for example, we have to ensure that the result of hashing "AA" "BB" is + // different from "A" "ABB" + H.update(S.size()); + H.update(S); +} + +void CachedCommandAdaptor::addFileContents( + CachedCommandAdaptor::HashAlgorithm &H, StringRef Buf) { + // this is a workaround temporary paths getting in the output files of the + // different commands in #line directives in preprocessed files, and the + // ModuleID or source_filename in the bitcode. 
+ while (!Buf.empty()) { + std::optional ComgrTmpPos = searchComgrTmpModel(Buf); + if (!ComgrTmpPos) { + addString(H, Buf); + break; + } + StringRef ToHash = Buf.substr(0, *ComgrTmpPos); + addString(H, ToHash); + Buf = Buf.substr(ToHash.size() + StringRef("comgr-xxxxxx").size()); + } +} + +Expected +CachedCommandAdaptor::getIdentifier() const { + CachedCommandAdaptor::HashAlgorithm H; + H.update(getClass()); + H.update(env::shouldEmitVerboseLogs()); + addString(H, getClangFullVersion()); + addString(H, getComgrHashIdentifier()); + addString(H, getDeviceLibrariesIdentifier()); + + if (Error E = addInputIdentifier(H)) + return E; + + addOptionsIdentifier(H); + + CachedCommandAdaptor::Identifier Id; + toHex(H.final(), true, Id); + return Id; +} + +llvm::Error +CachedCommandAdaptor::writeUniqueExecuteOutput(StringRef OutputFilename, + StringRef CachedBuffer) { + std::error_code EC; + raw_fd_ostream Out(OutputFilename, EC); + if (EC) { + Error E = createStringError(EC, Twine("Failed to open ") + OutputFilename + + " : " + EC.message() + "\n"); + return E; + } + + Out.write(CachedBuffer.data(), CachedBuffer.size()); + Out.close(); + if (Out.has_error()) { + Error E = createStringError(EC, Twine("Failed to write ") + OutputFilename + + " : " + EC.message() + "\n"); + return E; + } + + return Error::success(); +} + +Expected> +CachedCommandAdaptor::readUniqueExecuteOutput(StringRef OutputFilename) { + ErrorOr> MBOrErr = + MemoryBuffer::getFile(OutputFilename); + if (!MBOrErr) { + std::error_code EC = MBOrErr.getError(); + return createStringError(EC, Twine("Failed to open ") + OutputFilename + + " : " + EC.message() + "\n"); + } + + return std::move(*MBOrErr); +} +} // namespace COMGR diff --git a/amd/comgr/src/comgr-cache-command.h b/amd/comgr/src/comgr-cache-command.h new file mode 100644 index 0000000000000..6502bc45b573d --- /dev/null +++ b/amd/comgr/src/comgr-cache-command.h @@ -0,0 +1,85 @@ 
+/******************************************************************************* + * + * University of Illinois/NCSA + * Open Source License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the names of Advanced Micro Devices, Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this Software without specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. 
+ * + ******************************************************************************/ + +#ifndef COMGR_CACHE_COMMAND_H +#define COMGR_CACHE_COMMAND_H + +#include "amd_comgr.h" + +#include +#include +#include +#include + +namespace llvm { +class raw_ostream; +} + +namespace COMGR { +class CachedCommandAdaptor { +public: + using ActionClass = + std::underlying_type_t; + using HashAlgorithm = llvm::SHA256; + using Identifier = llvm::SmallString<64>; + + llvm::Expected getIdentifier() const; + + virtual bool canCache() const = 0; + virtual llvm::Error writeExecuteOutput(llvm::StringRef CachedBuffer) = 0; + virtual llvm::Expected readExecuteOutput() = 0; + virtual amd_comgr_status_t execute(llvm::raw_ostream &LogS) = 0; + + virtual ~CachedCommandAdaptor() = default; + + // helper to work around the comgr-xxxxx string appearing in files + static void addFileContents(HashAlgorithm &H, llvm::StringRef Buf); + static void addString(HashAlgorithm &H, llvm::StringRef S); + static std::optional searchComgrTmpModel(llvm::StringRef S); + + // helper since several command types just write to a single output file + static llvm::Error writeUniqueExecuteOutput(llvm::StringRef OutputFilename, + llvm::StringRef CachedBuffer); + static llvm::Expected> + readUniqueExecuteOutput(llvm::StringRef OutputFilename); + +protected: + virtual ActionClass getClass() const = 0; + virtual void addOptionsIdentifier(HashAlgorithm &) const = 0; + virtual llvm::Error addInputIdentifier(HashAlgorithm &) const = 0; +}; +} // namespace COMGR + +#endif diff --git a/amd/comgr/src/comgr-cache.cpp b/amd/comgr/src/comgr-cache.cpp new file mode 100644 index 0000000000000..3fdc71a74e790 --- /dev/null +++ b/amd/comgr/src/comgr-cache.cpp @@ -0,0 +1,275 @@ +/******************************************************************************* + * + * University of Illinois/NCSA + * Open Source License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the names of Advanced Micro Devices, Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this Software without specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. 
+ * + ******************************************************************************/ + +#include "comgr-cache.h" +#include "comgr-cache-command.h" +#include "comgr-env.h" +#include "comgr.h" + +#include +#include +#include + +namespace COMGR { +using namespace llvm; +using namespace clang::driver; + +namespace { + +const unsigned CacheTask = 1; + +void serializeCacheEntry(llvm::raw_ostream &FS, StringRef FileContents, + StringRef Log) { + auto WriteStringRef = [&FS](StringRef Buf) { + uint64_t Size = Buf.size(); + constexpr size_t NSize = sizeof(Size); + char SizeBuf[NSize]; + memcpy(SizeBuf, &Size, NSize); + FS.write(SizeBuf, NSize); + FS.write(Buf.data(), Size); + }; + + for (StringRef *Buf : {&FileContents, &Log}) { + WriteStringRef(*Buf); + } +} + +Error deserializeCacheEntry(const llvm::MemoryBuffer &Buffer, + StringRef &FileContents, StringRef &Log) { + auto ConsumeStringRef = [&](StringRef Buffer, + StringRef &Buf) -> Expected { + uint64_t Size; + constexpr size_t NSize = sizeof(Size); + if (NSize > Buffer.size()) + return createStringError( + "Cache entry file too small: couldn't read buffer size"); + memcpy(&Size, Buffer.data(), NSize); + Buffer = Buffer.substr(NSize); + if (Size > Buffer.size()) + return createStringError( + "Cache entry file too small: couldn't read buffer"); + Buf = Buffer.substr(0, Size); + return Buffer.substr(Size); + }; + + StringRef UnreadBuffer = Buffer.getBuffer(); + for (StringRef *Buf : {&FileContents, &Log}) { + auto ErrOrUnread = ConsumeStringRef(UnreadBuffer, *Buf); + if (!ErrOrUnread) + return ErrOrUnread.takeError(); + UnreadBuffer = *ErrOrUnread; + } + + if (!UnreadBuffer.empty()) + return createStringError( + "Cache entry file too big: extra bytes after the end"); + + return Error::success(); +} + +std::function +getComgrCacheErrorHandler(llvm::raw_ostream &LogS) { + if (!env::shouldEmitVerboseLogs()) { + return [](Error E, const char *) { consumeError(std::move(E)); }; + } + + return [&LogS](Error E, const char *When) 
{ + logAllUnhandledErrors(std::move(E), LogS, + Twine("Comgr cache, ") + When + ": "); + }; +} + +void saveCommandOutput(CachedCommandAdaptor &C, AddStreamFn &AddStream, + StringRef CapturedLogS, raw_ostream &LogS) { + auto ErrorHandler = getComgrCacheErrorHandler(LogS); + + Expected> FileOrErr = + AddStream(CacheTask, ""); + if (!FileOrErr) { + ErrorHandler(FileOrErr.takeError(), "when getting the cached file stream"); + return; + } + + Expected Buffer = C.readExecuteOutput(); + if (!Buffer) { + ErrorHandler(Buffer.takeError(), "when reading command's output"); + return; + } + + CachedFileStream *CFS = FileOrErr->get(); + serializeCacheEntry(*CFS->OS, *Buffer, CapturedLogS); + ErrorHandler(CFS->commit(), "when commiting file stream"); +} + +bool readEntryFromCache(CachedCommandAdaptor &C, MemoryBuffer &CachedBuffer, + raw_ostream &LogS) { + auto ErrorHandler = getComgrCacheErrorHandler(LogS); + + StringRef CachedOutputFile; + StringRef CachedLogS; + if (Error E = + deserializeCacheEntry(CachedBuffer, CachedOutputFile, CachedLogS)) { + ErrorHandler(std::move(E), "when reading the cache entry"); + return false; + } + + if (Error E = C.writeExecuteOutput(CachedOutputFile)) { + ErrorHandler(std::move(E), "when writing the command output"); + return false; + } + + LogS << CachedLogS; + return true; +} +} // namespace + +std::optional +CommandCache::getPolicyFromEnv(llvm::raw_ostream &LogS) { + StringRef PolicyString = COMGR::env::getCachePolicy(); + if (PolicyString.empty()) { + // Default policy: scan at most once per hour, take up at most 75% of + // available disk space or 5GB (whichever is smaller), no limit on number + // or age of files. 
+ + CachePruningPolicy DefaultPolicy; + DefaultPolicy.Interval = std::chrono::hours(1); + DefaultPolicy.Expiration = std::chrono::hours(0); + DefaultPolicy.MaxSizePercentageOfAvailableSpace = 75; + DefaultPolicy.MaxSizeBytes = 5ul << 30; // Gb to byte; + DefaultPolicy.MaxSizeFiles = 0; + return DefaultPolicy; + } + + Expected PolicyOrErr = + parseCachePruningPolicy(PolicyString); + if (!PolicyOrErr) { + auto ErrorHandler = getComgrCacheErrorHandler(LogS); + ErrorHandler(PolicyOrErr.takeError(), "when parsing the cache policy"); + return std::nullopt; + } + return *PolicyOrErr; +} + +void CommandCache::prune() { pruneCache(CacheDir, Policy); } + +std::unique_ptr CommandCache::get(raw_ostream &LogS) { + StringRef CacheDir = env::getCacheDirectory(); + if (CacheDir.empty()) + return nullptr; + + std::optional Policy = + CommandCache::getPolicyFromEnv(LogS); + if (!Policy) + return nullptr; + + return std::unique_ptr(new CommandCache(CacheDir, *Policy)); +} + +CommandCache::CommandCache(StringRef CacheDir, const CachePruningPolicy &Policy) + : CacheDir(CacheDir.str()), Policy(Policy) { + assert(!CacheDir.empty()); +} + +CommandCache::~CommandCache() { prune(); } + +amd_comgr_status_t CommandCache::execute(CachedCommandAdaptor &C, + raw_ostream &LogS) { + + if (!C.canCache()) { + // Do not cache preprocessor commands. + // Handling include directories and constants is hard and this simplifies + // our implementation. Preprocessing is fast. + return C.execute(LogS); + } + + // This lambda will get called when the data is gotten from the cache and + // also after the data was set for a given key. 
+ std::unique_ptr CachedBuffer; + auto AddBuffer = [&CachedBuffer](unsigned Task, const Twine &ModuleName, + std::unique_ptr M) { + CachedBuffer = std::move(M); + }; + + auto ErrorHandler = getComgrCacheErrorHandler(LogS); + + Expected CacheOrErr = + localCache("AMDGPUCompilerCache", "amdgpu-compiler", CacheDir, AddBuffer); + if (!CacheOrErr) { + ErrorHandler(CacheOrErr.takeError(), "when creating cache directory"); + return C.execute(LogS); + } + + auto MaybeId = C.getIdentifier(); + if (!MaybeId) { + ErrorHandler(MaybeId.takeError(), + "when computing the identifier for the command"); + return C.execute(LogS); + } + + FileCache &Cache = *CacheOrErr; + + // If we call the "Cache" function and the data is cached, it will call the + // "AddBuffer" lambda function from the constructor which will in turn take + // ownership of the member buffer that is passed to the callback and put it + // into the CachedBuffer member variable. + Expected AddStreamOrErr = Cache(CacheTask, *MaybeId, ""); + if (!AddStreamOrErr) { + ErrorHandler(AddStreamOrErr.takeError(), + "when building the add stream callback"); + return C.execute(LogS); + } + + // If the "AddStream" is nullptr, then the data was cached and we already + // called the "AddBuffer" lambda. 
+ AddStreamFn &AddStream = *AddStreamOrErr; + if (!AddStream && readEntryFromCache(C, *CachedBuffer, LogS)) { + if (env::shouldEmitVerboseLogs()) + LogS << "Comgr cache: entry " << *MaybeId << " found in cache.\n"; + return AMD_COMGR_STATUS_SUCCESS; + } + + std::string CapturedLogS; + llvm::raw_string_ostream CaptureLogS(CapturedLogS); + amd_comgr_status_t Result = C.execute(CaptureLogS); + CaptureLogS.flush(); + LogS << CapturedLogS; + + if (Result == AMD_COMGR_STATUS_SUCCESS && AddStream) { + saveCommandOutput(C, AddStream, CapturedLogS, LogS); + } + + return Result; +} +} // namespace COMGR diff --git a/amd/comgr/src/comgr-cache.h b/amd/comgr/src/comgr-cache.h new file mode 100644 index 0000000000000..c95ad471aff76 --- /dev/null +++ b/amd/comgr/src/comgr-cache.h @@ -0,0 +1,77 @@ +/******************************************************************************* + * + * University of Illinois/NCSA + * Open Source License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the names of Advanced Micro Devices, Inc. 
nor the names of its + * contributors may be used to endorse or promote products derived from + * this Software without specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + ******************************************************************************/ + +#ifndef COMGR_CACHE_H +#define COMGR_CACHE_H + +#include "amd_comgr.h" +#include "comgr-cache-command.h" + +#include +#include +#include + +#include +#include + +namespace llvm { +class raw_ostream; +} // namespace llvm + +namespace COMGR { +class CommandCache { + std::string CacheDir; + llvm::CachePruningPolicy Policy; + + CommandCache(llvm::StringRef CacheDir, + const llvm::CachePruningPolicy &Policy); + + static std::optional + getPolicyFromEnv(llvm::raw_ostream &LogS); + +public: + static std::unique_ptr get(llvm::raw_ostream &); + + ~CommandCache(); + void prune(); + + /// Checks if the Command C is cached. + /// If it is the case, it replaces its output and logs its error-stream. 
+ /// Otherwise it executes C through the callback Execute + amd_comgr_status_t execute(CachedCommandAdaptor &C, llvm::raw_ostream &LogS); +}; +} // namespace COMGR + +#endif diff --git a/amd/comgr/src/comgr-clang-command.cpp b/amd/comgr/src/comgr-clang-command.cpp new file mode 100644 index 0000000000000..3a3fc92c4edd9 --- /dev/null +++ b/amd/comgr/src/comgr-clang-command.cpp @@ -0,0 +1,177 @@ +#include "comgr-clang-command.h" + +#include +#include + +namespace COMGR { +using namespace llvm; +using namespace clang; +namespace { +bool hasDebugOrProfileInfo(ArrayRef Args) { + // These are too difficult to handle since they generate debug info that + // refers to the temporary paths used by comgr. + const StringRef Flags[] = {"-fdebug-info-kind", "-fprofile", "-coverage", + "-ftime-trace"}; + + for (StringRef Arg : Args) { + for (StringRef Flag : Flags) { + if (Arg.starts_with(Flag)) + return true; + } + } + return false; +} + +Error addFile(CachedCommandAdaptor::HashAlgorithm &H, StringRef Path) { + auto BufOrError = MemoryBuffer::getFile(Path); + if (std::error_code EC = BufOrError.getError()) { + return errorCodeToError(EC); + } + StringRef Buf = BufOrError.get()->getBuffer(); + + CachedCommandAdaptor::addFileContents(H, Buf); + + return Error::success(); +} + +template +bool skipProblematicFlag(IteratorTy &It, const IteratorTy &End) { + // Skip include paths, these should have been handled by preprocessing the + // source first. Sadly, these are passed also to the middle-end commands. Skip + // debug related flags (they should be ignored) like -dumpdir (used for + // profiling/coverage/split-dwarf) + StringRef Arg = *It; + static const StringSet<> FlagsWithPathArg = {"-I", "-dumpdir"}; + bool IsFlagWithPathArg = It + 1 != End && FlagsWithPathArg.contains(Arg); + if (IsFlagWithPathArg) { + ++It; + return true; + } + + // Clang always appends the debug compilation dir, + // even without debug info (in comgr it matches the current directory). 
We + // only consider it if the user specified debug information + bool IsFlagWithSingleArg = Arg.starts_with("-fdebug-compilation-dir="); + if (IsFlagWithSingleArg) { + return true; + } + + return false; +} + +SmallVector getInputFiles(driver::Command &Command) { + const auto &CommandInputs = Command.getInputInfos(); + + SmallVector Paths; + Paths.reserve(CommandInputs.size()); + + for (const auto &II : CommandInputs) { + if (!II.isFilename()) + continue; + Paths.push_back(II.getFilename()); + } + + return Paths; +} + +bool isSourceCodeInput(const driver::InputInfo &II) { + return driver::types::isSrcFile(II.getType()); +} +} // namespace +ClangCommand::ClangCommand(driver::Command &Command, + DiagnosticOptions &DiagOpts, vfs::FileSystem &VFS, + ExecuteFnTy &&ExecuteImpl) + : Command(Command), DiagOpts(DiagOpts), VFS(VFS), + ExecuteImpl(std::move(ExecuteImpl)) {} + +Error ClangCommand::addInputIdentifier(HashAlgorithm &H) const { + auto Inputs(getInputFiles(Command)); + for (StringRef Input : Inputs) { + if (Error E = addFile(H, Input)) { + // call Error's constructor again to silence copy elision warning + return Error(std::move(E)); + } + } + return Error::success(); +} + +void ClangCommand::addOptionsIdentifier(HashAlgorithm &H) const { + auto Inputs(getInputFiles(Command)); + StringRef Output = Command.getOutputFilenames().front(); + ArrayRef Arguments = Command.getArguments(); + for (auto It = Arguments.begin(), End = Arguments.end(); It != End; ++It) { + if (skipProblematicFlag(It, End)) + continue; + + StringRef Arg = *It; + static const StringSet<> FlagsWithFileArgEmbededInComgr = { + "-include-pch", "-mlink-builtin-bitcode"}; + if (FlagsWithFileArgEmbededInComgr.contains(Arg)) { + // The next argument is a path to a "secondary" input-file (pre-compiled + // header or device-libs builtin) + // These two files kinds of files are embedded in comgr at compile time, + // and in normally their remain constant with comgr's build. 
The user is + // not able to change them. + ++It; + if (It == End) + break; + continue; + } + + // input files are considered by their content + // output files should not be considered at all + bool IsIOFile = Output == Arg || is_contained(Inputs, Arg); + if (IsIOFile) + continue; + +#ifndef NDEBUG + bool IsComgrTmpPath = + CachedCommandAdaptor::searchComgrTmpModel(Arg).has_value(); + // On debug builds, fail on /tmp/comgr-xxxx/... paths. + // Implicit dependencies should have been considered before. + // On release builds, add them to the hash to force a cache miss. + assert(!IsComgrTmpPath && + "Unexpected flag and path to comgr temporary directory"); +#endif + + addString(H, Arg); + } +} + +ClangCommand::ActionClass ClangCommand::getClass() const { + return Command.getSource().getKind(); +} + +bool ClangCommand::canCache() const { + bool HasOneOutput = Command.getOutputFilenames().size() == 1; + bool IsPreprocessorCommand = getClass() == driver::Action::PreprocessJobClass; + + // This reduces the applicability of the cache, but it helps us deliver + // something now and deal with the PCH issues later. The cache would still + // help for spirv compilation (e.g. 
bitcode->asm) and for intermediate + // compilation steps + bool HasSourceCodeInput = any_of(Command.getInputInfos(), isSourceCodeInput); + + return HasOneOutput && !IsPreprocessorCommand && !HasSourceCodeInput && + !hasDebugOrProfileInfo(Command.getArguments()); +} + +Error ClangCommand::writeExecuteOutput(StringRef CachedBuffer) { + StringRef OutputFilename = Command.getOutputFilenames().front(); + return CachedCommandAdaptor::writeUniqueExecuteOutput(OutputFilename, + CachedBuffer); +} + +Expected ClangCommand::readExecuteOutput() { + auto MaybeBuffer = CachedCommandAdaptor::readUniqueExecuteOutput( + Command.getOutputFilenames().front()); + if (!MaybeBuffer) + return MaybeBuffer.takeError(); + Output = std::move(*MaybeBuffer); + return Output->getBuffer(); +} + +amd_comgr_status_t ClangCommand::execute(raw_ostream &LogS) { + return ExecuteImpl(Command, LogS, DiagOpts, VFS); +} +} // namespace COMGR diff --git a/amd/comgr/src/comgr-clang-command.h b/amd/comgr/src/comgr-clang-command.h new file mode 100644 index 0000000000000..a16883fb95980 --- /dev/null +++ b/amd/comgr/src/comgr-clang-command.h @@ -0,0 +1,86 @@ +/******************************************************************************* + * + * University of Illinois/NCSA + * Open Source License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. 
+ * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the names of Advanced Micro Devices, Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this Software without specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + ******************************************************************************/ + +#ifndef COMGR_CLANG_COMMAND_H +#define COMGR_CLANG_COMMAND_H + +#include "comgr-cache-command.h" + +#include + +namespace clang { +class DiagnosticOptions; +namespace driver { +class Command; +} // namespace driver +} // namespace clang + +namespace COMGR { +class ClangCommand final : public CachedCommandAdaptor { +public: + using ExecuteFnTy = std::function; + +private: + clang::driver::Command &Command; + clang::DiagnosticOptions &DiagOpts; + llvm::vfs::FileSystem &VFS; + ExecuteFnTy ExecuteImpl; + + // To avoid copies, store the output of execute, such that readExecuteOutput + // can return a reference. 
+ std::unique_ptr Output; + +public: + ClangCommand(clang::driver::Command &Command, + clang::DiagnosticOptions &DiagOpts, llvm::vfs::FileSystem &VFS, + ExecuteFnTy &&ExecuteImpl); + + bool canCache() const override; + llvm::Error writeExecuteOutput(llvm::StringRef CachedBuffer) override; + llvm::Expected readExecuteOutput() override; + amd_comgr_status_t execute(llvm::raw_ostream &LogS) override; + + ~ClangCommand() override = default; + +protected: + ActionClass getClass() const override; + void addOptionsIdentifier(HashAlgorithm &) const override; + llvm::Error addInputIdentifier(HashAlgorithm &) const override; +}; +} // namespace COMGR + +#endif diff --git a/amd/comgr/src/comgr-compiler.cpp b/amd/comgr/src/comgr-compiler.cpp index bc8da2eee8908..82102910a9cd7 100644 --- a/amd/comgr/src/comgr-compiler.cpp +++ b/amd/comgr/src/comgr-compiler.cpp @@ -37,12 +37,15 @@ ******************************************************************************/ #include "comgr-compiler.h" +#include "comgr-cache-bundler-command.h" +#include "comgr-cache.h" +#include "comgr-clang-command.h" #include "comgr-device-libs.h" #include "comgr-diagnostic-handler.h" #include "comgr-env.h" +#include "comgr-spirv-command.h" #include "lld/Common/CommonLinkerContext.h" #include "lld/Common/Driver.h" -#include "clang/Basic/Version.h" #include "clang/CodeGen/CodeGenAction.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/DriverDiagnostic.h" @@ -54,7 +57,9 @@ #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/FrontendTool/Utils.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" @@ -80,14 +85,10 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/Signals.h" -#include "llvm/Support/WithColor.h" #include 
"llvm/Support/VirtualFileSystem.h" +#include "llvm/Support/WithColor.h" #include "llvm/TargetParser/Host.h" -#ifndef COMGR_DISABLE_SPIRV -#include "LLVMSPIRVLib/LLVMSPIRVLib.h" -#endif - #include "time-stat/ts-interface.h" #include @@ -553,6 +554,11 @@ SmallString<128> getFilePath(DataObject *Object, StringRef Dir) { return Path; } +// TODO: Move inputFromFile and outputToFile within AMDGPUCompiler +// +// Currently, we only invoke these two methods in the context of AMDGPUCompiler. +// Moreover, member functions that deal with file I/O should not worry whether +// the underlying filesystem being used is virtual or real. amd_comgr_status_t inputFromFile(DataObject *Object, StringRef Path) { ProfilePoint Point("FileIO"); auto BufOrError = MemoryBuffer::getFile(Path); @@ -646,7 +652,7 @@ void logArgv(raw_ostream &OS, StringRef ProgramName, amd_comgr_status_t executeCommand(const Command &Job, raw_ostream &LogS, DiagnosticOptions &DiagOpts, - llvm::vfs::FileSystem &VFS) { + llvm::vfs::FileSystem &FS) { TextDiagnosticPrinter DiagClient(LogS, &DiagOpts); IntrusiveRefCntPtr DiagID(new DiagnosticIDs); DiagnosticsEngine Diags(DiagID, &DiagOpts, &DiagClient, false); @@ -672,9 +678,11 @@ amd_comgr_status_t executeCommand(const Command &Job, raw_ostream &LogS, std::unique_ptr Clang(new CompilerInstance()); Clang->setVerboseOutputStream(LogS); + Clang->setFileManager(new FileManager(Clang->getFileSystemOpts(), &FS)); if (!Argv.back()) { Argv.pop_back(); } + if (!CompilerInvocation::CreateFromArgs(Clang->getInvocation(), Argv, Diags)) { return AMD_COMGR_STATUS_ERROR; @@ -682,7 +690,7 @@ amd_comgr_status_t executeCommand(const Command &Job, raw_ostream &LogS, // Internally this call refers to the invocation created above, so at // this point the DiagnosticsEngine should accurately reflect all user // requested configuration from Argv. 
- Clang->createDiagnostics(VFS, &DiagClient, /* ShouldOwnClient */ false); + Clang->createDiagnostics(FS, &DiagClient, /* ShouldOwnClient */ false); if (!Clang->hasDiagnostics()) { return AMD_COMGR_STATUS_ERROR; } @@ -720,6 +728,15 @@ amd_comgr_status_t executeCommand(const Command &Job, raw_ostream &LogS, return AMD_COMGR_STATUS_SUCCESS; } +std::string getStableCUID(const DataSet *InSet) { + using Hash = CachedCommandAdaptor::HashAlgorithm; + Hash H; + for (const DataObject *Input : InSet->DataObjects) { + CachedCommandAdaptor::addFileContents(H, + StringRef{Input->Data, Input->Size}); + } + return toHex(H.final()); +} } // namespace amd_comgr_status_t @@ -746,12 +763,11 @@ AMDGPUCompiler::executeInProcessDriver(ArrayRef Args) { IntrusiveRefCntPtr DiagID(new DiagnosticIDs); DiagnosticsEngine Diags(DiagID, &*DiagOpts, DiagClient); - auto VFS = llvm::vfs::getRealFileSystem(); - ProcessWarningOptions(Diags, *DiagOpts, *VFS, /*ReportDiags=*/false); + ProcessWarningOptions(Diags, *DiagOpts, *OverlayFS, /*ReportDiags=*/false); Driver TheDriver((Twine(env::getLLVMPath()) + "/bin/clang").str(), llvm::sys::getDefaultTargetTriple(), Diags, - "AMDGPU Code Object Manager", VFS); + "AMDGPU Code Object Manager", OverlayFS); TheDriver.setCheckInputsExist(false); // Log arguments used to build compilation @@ -771,9 +787,17 @@ AMDGPUCompiler::executeInProcessDriver(ArrayRef Args) { return AMD_COMGR_STATUS_ERROR; } + auto Cache = CommandCache::get(LogS); for (auto &Job : C->getJobs()) { - if (auto Status = executeCommand(Job, LogS, *DiagOpts, *VFS)) { - return Status; + ClangCommand C(Job, *DiagOpts, *OverlayFS, executeCommand); + if (Cache) { + if (auto Status = Cache->execute(C, LogS)) { + return Status; + } + } else { + if (auto Status = C.execute(LogS)) { + return Status; + } } } return AMD_COMGR_STATUS_SUCCESS; @@ -861,7 +885,8 @@ amd_comgr_status_t AMDGPUCompiler::removeTmpDirs() { #endif } -amd_comgr_status_t AMDGPUCompiler::processFile(const char *InputFilePath, 
+amd_comgr_status_t AMDGPUCompiler::processFile(DataObject *Input, + const char *InputFilePath, const char *OutputFilePath) { SmallVector Argv = Args; @@ -882,6 +907,12 @@ amd_comgr_status_t AMDGPUCompiler::processFile(const char *InputFilePath, Argv.push_back("-save-temps=obj"); } + // Add SPIR-V flags + for (auto Flag : Input->SpirvFlags) { + Argv.push_back("-Xclang"); + Argv.push_back(Flag); + } + Argv.push_back(InputFilePath); Argv.push_back("-o"); @@ -893,6 +924,12 @@ amd_comgr_status_t AMDGPUCompiler::processFile(const char *InputFilePath, amd_comgr_status_t AMDGPUCompiler::processFiles(amd_comgr_data_kind_t OutputKind, const char *OutputSuffix) { + return processFiles(OutputKind, OutputSuffix, InSet); +} + +amd_comgr_status_t +AMDGPUCompiler::processFiles(amd_comgr_data_kind_t OutputKind, + const char *OutputSuffix, DataSet *InSet) { for (auto *Input : InSet->DataObjects) { if (Input->DataKind != AMD_COMGR_DATA_KIND_INCLUDE) { continue; @@ -930,7 +967,7 @@ AMDGPUCompiler::processFiles(amd_comgr_data_kind_t OutputKind, auto OutputFilePath = getFilePath(Output, OutputDir); if (auto Status = - processFile(InputFilePath.c_str(), OutputFilePath.c_str())) { + processFile(Input, InputFilePath.c_str(), OutputFilePath.c_str())) { return Status; } @@ -1031,6 +1068,10 @@ amd_comgr_status_t AMDGPUCompiler::addCompilationFlags() { Args.push_back(ROCMIncludePath.c_str()); Args.push_back("-isystem"); Args.push_back(HIPIncludePath.c_str()); + // Pass a cuid that depends on the input files + // Otherwise, a random (which depends on the /tmp/comgr-xxxxx path) cuid is + // generated which causes a cache miss on every run. 
+ Args.push_back(Saver.save("-cuid=" + getStableCUID(InSet)).data()); break; default: return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; @@ -1068,8 +1109,18 @@ amd_comgr_status_t AMDGPUCompiler::addDeviceLibraries() { for (auto DeviceLib : getDeviceLibraries()) { llvm::SmallString<128> DeviceLibPath = DeviceLibsDir; path::append(DeviceLibPath, std::get<0>(DeviceLib)); - if (auto Status = outputToFile(std::get<1>(DeviceLib), DeviceLibPath)) { - return Status; + // TODO: We should abstract the logic of deciding whether to use the VFS + // or the real file system within inputFromFile and outputToFile. + if (UseVFS) { + if (!InMemoryFS->addFile( + DeviceLibPath, /* ModificationTime */ 0, + llvm::MemoryBuffer::getMemBuffer(std::get<1>(DeviceLib)))) { + return AMD_COMGR_STATUS_ERROR; + } + } else { + if (auto Status = outputToFile(std::get<1>(DeviceLib), DeviceLibPath)) { + return Status; + } } } } @@ -1227,9 +1278,10 @@ amd_comgr_status_t AMDGPUCompiler::unbundle() { } // Collect bitcode memory buffers from bitcodes, bundles, and archives + auto Cache = CommandCache::get(LogS); for (auto *Input : InSet->DataObjects) { - std::string FileExtension; + const char *FileExtension; amd_comgr_data_kind_t UnbundledDataKind; switch (Input->DataKind) { case AMD_COMGR_DATA_KIND_BC_BUNDLE: @@ -1260,76 +1312,60 @@ amd_comgr_status_t AMDGPUCompiler::unbundle() { const size_t BufSize = sizeof(char) * 30; char *Buf = (char *)malloc(BufSize); snprintf(Buf, BufSize, "comgr-bundle-%d.%s", std::rand() % 10000, - FileExtension.c_str()); + FileExtension); Input->Name = Buf; } // Write input file system so that OffloadBundler API can process // TODO: Switch write to VFS - std::string InputFilePath = getFilePath(Input, InputDir).str().str(); + SmallString<128> InputFilePath = getFilePath(Input, InputDir); if (auto Status = outputToFile(Input, InputFilePath)) { return Status; } // Bundler input name - BundlerConfig.InputFileNames.push_back(InputFilePath); + 
BundlerConfig.InputFileNames.emplace_back(InputFilePath); // Generate prefix for output files - std::string OutputPrefix = std::string(Input->Name); + StringRef OutputPrefix = Input->Name; size_t Index = OutputPrefix.find_last_of("."); OutputPrefix = OutputPrefix.substr(0, Index); // Bundler target and output names - for (auto Entry : ActionInfo->BundleEntryIDs) { - BundlerConfig.TargetNames.push_back(Entry); - - // Add an output file for each target - std::string OutputFileName = - OutputPrefix + '-' + Entry + "." + FileExtension; + for (StringRef Entry : ActionInfo->BundleEntryIDs) { + BundlerConfig.TargetNames.emplace_back(Entry); - // TODO: Switch this to LLVM path APIs - std::string OutputFilePath = OutputDir.str().str() + "/" + OutputFileName; - BundlerConfig.OutputFileNames.push_back(OutputFilePath); + SmallString<128> OutputFilePath = OutputDir; + sys::path::append(OutputFilePath, + OutputPrefix + "-" + Entry + "." + FileExtension); + BundlerConfig.OutputFileNames.emplace_back(OutputFilePath); } - OffloadBundler Bundler(BundlerConfig); - - // TODO: log vectors, build clang command if (env::shouldEmitVerboseLogs()) { LogS << "Extracting Bundle:\n" << "\t Unbundled Files Extension: ." 
<< FileExtension << "\n" << "\t Bundle Entry ID: " << BundlerConfig.TargetNames[0] << "\n" << "\t Input Filename: " << BundlerConfig.InputFileNames[0] << "\n" - << "\t Output Filename: " << BundlerConfig.OutputFileNames[0] - << "\n"; + << "\t Output Filenames: "; + for (StringRef OutputFileName : BundlerConfig.OutputFileNames) + LogS << OutputFileName << " "; + LogS << "\n"; LogS.flush(); } - switch (Input->DataKind) { - case AMD_COMGR_DATA_KIND_BC_BUNDLE: { - llvm::Error Err = Bundler.UnbundleFiles(); - llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), - "Unbundle Bitcodes Error: "); - break; - } - case AMD_COMGR_DATA_KIND_AR_BUNDLE: { - llvm::Error Err = Bundler.UnbundleArchive(); - llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), - "Unbundle Archives Error: "); - break; - } - case AMD_COMGR_DATA_KIND_OBJ_BUNDLE: { - llvm::Error Err = Bundler.UnbundleFiles(); - llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), - "Unbundle Objects Error: "); - break; - } - default: - llvm_unreachable("invalid bundle type"); + UnbundleCommand Unbundle(Input->DataKind, BundlerConfig); + if (Cache) { + if (auto Status = Cache->execute(Unbundle, LogS)) { + return Status; + } + } else { + if (auto Status = Unbundle.execute(LogS)) { + return Status; + } } // Add new bitcodes to OutSetT - for (auto OutputFilePath : BundlerConfig.OutputFileNames) { + for (StringRef OutputFilePath : BundlerConfig.OutputFileNames) { amd_comgr_data_t ResultT; @@ -1340,22 +1376,15 @@ amd_comgr_status_t AMDGPUCompiler::unbundle() { ScopedDataObjectReleaser SDOR(ResultT); DataObject *Result = DataObject::convert(ResultT); - if (auto Status = inputFromFile(Result, StringRef(OutputFilePath))) + if (auto Status = inputFromFile(Result, OutputFilePath)) return Status; - StringRef OutputFileName = - llvm::sys::path::filename(StringRef(OutputFilePath)); + StringRef OutputFileName = sys::path::filename(OutputFilePath); Result->setName(OutputFileName); if (auto Status = 
amd_comgr_data_set_add(OutSetT, ResultT)) { return Status; } - - // Remove input and output file after reading back into Comgr data - if (!env::shouldEmitVerboseLogs()) { - sys::fs::remove(InputFilePath); - sys::fs::remove(OutputFilePath); - } } } @@ -1871,22 +1900,108 @@ amd_comgr_status_t AMDGPUCompiler::linkToExecutable() { return amd_comgr_data_set_add(OutSetT, OutputT); } +// TODO: Generalize this list to include all -cc1 flags and arguments that are +// still valid in a bitcode compilation context +static inline const std::unordered_set ValidSpirvFlags{ + "-fapprox-func", + "-fcolor-diagnostics", + "-fconvergent-functions", + "-fexceptions", + "-ffast-math", + "-ffinite-math-only", + "-ffp-contract=fast", + "-ffp-contract=fast-honor-pragmas", + "-fgpu-rdc", + "-finline-functions", + "-fno-signed-zeros", + "-fno-rounding-math", + "-fno-experimental-relative-c++-abi-vtables", + "-fno-autolink", + "-freciprocal-math", + "-funsafe-math-optimizations", + "-fvisibility=hidden", + "-O0", + "-O1", + "-O2", + "-O3", + "--save-temps"}; + +amd_comgr_status_t AMDGPUCompiler::extractSpirvFlags(DataSet *BcSet) { + + for (auto *Bc : BcSet->DataObjects) { + // Create SPIRV IR Module from Bitcode Buffer + SMDiagnostic SMDiag; + LLVMContext Context; + Context.setDiagnosticHandler( + std::make_unique(this->LogS), true); + + auto Mod = getLazyIRModule( + MemoryBuffer::getMemBuffer(StringRef(Bc->Data, Bc->Size), "", false), + SMDiag, Context, true); + + if (!Mod) { + SMDiag.print("SPIR-V Bitcode", LogS, /* ShowColors */ false); + return AMD_COMGR_STATUS_ERROR; + } + + if (verifyModule(*Mod, &LogS)) + return AMD_COMGR_STATUS_ERROR; + + // Fetch @llvm.cmdline + GlobalVariable *CmdLine = Mod->getNamedGlobal("llvm.cmdline"); + + // Return if no @llvm.cmdline + if (!CmdLine) + return AMD_COMGR_STATUS_SUCCESS; + + if (ConstantDataSequential *CDS = + dyn_cast(CmdLine->getInitializer())) { + + // Add each valid null-terminated '\0' string to Flags + std::string Tmp; + StringRef 
CmdLineRaw = CDS->getRawDataValues(); + std::stringstream ss(CmdLineRaw.str()); + while (getline(ss, Tmp, '\0')) { + if (Tmp == "--hipstdpar" || Tmp == "-amdgpu-enable-hipstdpar") { + Bc->SpirvFlags.push_back("-mllvm"); + Bc->SpirvFlags.push_back("-amdgpu-enable-hipstdpar"); + } else if (ValidSpirvFlags.count(Tmp)) { + Bc->SpirvFlags.push_back(Saver.save(Tmp.c_str()).data()); + } + } + } + + // COV5 required for SPIRV + Bc->SpirvFlags.push_back("-mcode-object-version=5"); + + if (env::shouldEmitVerboseLogs()) { + LogS << " SPIR-V Flags: " << Bc->Name << "\n"; + for (auto Flag : Bc->SpirvFlags) + LogS << " " << Flag << "\n"; + } + } + + return AMD_COMGR_STATUS_SUCCESS; +} + amd_comgr_status_t AMDGPUCompiler::translateSpirvToBitcode() { + return translateSpirvToBitcodeImpl(InSet, DataSet::convert(OutSetT)); +} + +amd_comgr_status_t +AMDGPUCompiler::translateSpirvToBitcodeImpl(DataSet *SpirvInSet, + DataSet *BcOutSet) { #ifdef COMGR_DISABLE_SPIRV - LogS << "Calling AMDGPUCompiler::translateSpirvToBitcode() not supported " - << "Comgr is built with -DCOMGR_DISABLE_SPIRV. Re-build LLVM and Comgr " - << "with LLVM-SPIRV-Translator support to continue.\n"; + LogS << "Calling AMDGPUCompiler::translateSpirvToBitcodeImpl() not " + << "supported. Comgr is built with -DCOMGR_DISABLE_SPIRV. 
Re-build LLVM " + << "and Comgr with LLVM-SPIRV-Translator support to continue.\n"; return AMD_COMGR_STATUS_ERROR; #else if (auto Status = createTmpDirs()) { return Status; } - LLVMContext Context; - Context.setDiagnosticHandler( - std::make_unique(this->LogS), true); - - for (auto *Input : InSet->DataObjects) { + for (auto *Input : SpirvInSet->DataObjects) { if (env::shouldSaveTemps()) { if (auto Status = outputToFile(Input, getFilePath(Input, InputDir))) { @@ -1898,28 +2013,20 @@ amd_comgr_status_t AMDGPUCompiler::translateSpirvToBitcode() { return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; } - // TODO: With C++23, we should investigate replacing with spanstream - // to avoid memory copies: - // https://en.cppreference.com/w/cpp/io/basic_ispanstream - std::istringstream ISS(std::string(Input->Data, Input->Size)); - - llvm::Module *M; - std::string Err; - - SPIRV::TranslatorOpts Opts; - Opts.enableAllExtensions(); - Opts.setDesiredBIsRepresentation(SPIRV::BIsRepresentation::OpenCL20); + SmallString<0> OutBuf; + SPIRVCommand SPIRV(Input, OutBuf); - if (!llvm::readSpirv(Context, Opts, ISS, M, Err)) { - LogS << "Failed to load SPIR-V as LLVM Module: " << Err << '\n'; - return AMD_COMGR_STATUS_ERROR; + auto Cache = CommandCache::get(LogS); + amd_comgr_status_t Status; + if (!Cache) { + Status = SPIRV.execute(LogS); + } else { + Status = Cache->execute(SPIRV, LogS); } - SmallString<0> OutBuf; - BitcodeWriter Writer(OutBuf); - Writer.writeModule(*M, false, nullptr, false, nullptr); - Writer.writeSymtab(); - Writer.writeStrtab(); + if (Status) { + return Status; + } amd_comgr_data_t OutputT; if (auto Status = amd_comgr_create_data(AMD_COMGR_DATA_KIND_BC, &OutputT)) { @@ -1933,7 +2040,8 @@ amd_comgr_status_t AMDGPUCompiler::translateSpirvToBitcode() { Output->setName(std::string(Input->Name) + std::string(".bc")); Output->setData(OutBuf); - if (auto Status = amd_comgr_data_set_add(OutSetT, OutputT)) { + if (auto Status = + amd_comgr_data_set_add(DataSet::convert(BcOutSet), 
OutputT)) { return Status; } @@ -1952,11 +2060,74 @@ amd_comgr_status_t AMDGPUCompiler::translateSpirvToBitcode() { #endif } +amd_comgr_status_t AMDGPUCompiler::compileSpirvToRelocatable() { + if (auto Status = createTmpDirs()) { + return Status; + } + + for (auto *Input : InSet->DataObjects) { + if (Input->DataKind != AMD_COMGR_DATA_KIND_SPIRV) + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + + // Translate .spv to .bc + amd_comgr_data_set_t TranslatedSpirvT; + if (auto Status = amd_comgr_create_data_set(&TranslatedSpirvT)) + return Status; + DataSet *TranslatedSpirv = DataSet::convert(TranslatedSpirvT); + + if (auto Status = translateSpirvToBitcodeImpl(InSet, TranslatedSpirv)) + return Status; + + // Extract any SPIR-V flags from @llvm.cmdline + if (auto Status = extractSpirvFlags(TranslatedSpirv)) + return Status; + + // Compile bitcode to relocatable + if (ActionInfo->IsaName) { + if (auto Status = addTargetIdentifierFlags(ActionInfo->IsaName)) { + return Status; + } + } + + if (ActionInfo->ShouldLinkDeviceLibs) { + if (auto Status = addDeviceLibraries()) { + return Status; + } + } + + Args.push_back("-c"); + + Args.push_back("-mllvm"); + Args.push_back("-amdgpu-internalize-symbols"); + + return processFiles(AMD_COMGR_DATA_KIND_RELOCATABLE, ".o", TranslatedSpirv); +} + AMDGPUCompiler::AMDGPUCompiler(DataAction *ActionInfo, DataSet *InSet, DataSet *OutSet, raw_ostream &LogS) : ActionInfo(ActionInfo), InSet(InSet), OutSetT(DataSet::convert(OutSet)), LogS(LogS) { initializeCommandLineArgs(Args); + + // Initialize OverlayFS with the real file system which helps redirect + // non-VFS reads and writes. 
+ OverlayFS = new vfs::OverlayFileSystem(vfs::getRealFileSystem()); + + std::optional VFSStatus = env::shouldUseVFS(); + if ((VFSStatus.has_value() && *VFSStatus) || + (!VFSStatus.has_value() && ActionInfo->ShouldUseVFS)) { + if (env::shouldEmitVerboseLogs()) { + LogS << " File System: VFS\n"; + } + UseVFS = true; + InMemoryFS = new vfs::InMemoryFileSystem; + OverlayFS->pushOverlay(InMemoryFS); + } else { + if (env::shouldEmitVerboseLogs()) { + LogS << " File System: Real\n"; + } + } } AMDGPUCompiler::~AMDGPUCompiler() { diff --git a/amd/comgr/src/comgr-compiler.h b/amd/comgr/src/comgr-compiler.h index 39a2842664349..723488362d05a 100644 --- a/amd/comgr/src/comgr-compiler.h +++ b/amd/comgr/src/comgr-compiler.h @@ -41,6 +41,7 @@ #include "comgr.h" #include "clang/Driver/Driver.h" +#include "llvm/Support/VirtualFileSystem.h" namespace COMGR { @@ -72,22 +73,32 @@ class AMDGPUCompiler { llvm::StringSaver Saver = Allocator; /// Whether we need to disable Clang's device-lib linking. bool NoGpuLib = true; + bool UseVFS = false; + + llvm::IntrusiveRefCntPtr OverlayFS; + llvm::IntrusiveRefCntPtr InMemoryFS; amd_comgr_status_t createTmpDirs(); amd_comgr_status_t removeTmpDirs(); - amd_comgr_status_t processFile(const char *InputFilePath, + amd_comgr_status_t processFile(DataObject *Input, const char *InputFilePath, const char *OutputFilePath); /// Process each file in @c InSet individually, placing output in @c OutSet. 
amd_comgr_status_t processFiles(amd_comgr_data_kind_t OutputKind, const char *OutputSuffix); + amd_comgr_status_t processFiles(amd_comgr_data_kind_t OutputKind, + const char *OutputSuffix, DataSet *InSet); amd_comgr_status_t addIncludeFlags(); amd_comgr_status_t addTargetIdentifierFlags(llvm::StringRef IdentStr, bool CompilingSrc); amd_comgr_status_t addCompilationFlags(); amd_comgr_status_t addDeviceLibraries(); + amd_comgr_status_t extractSpirvFlags(DataSet *BcSet); amd_comgr_status_t executeInProcessDriver(llvm::ArrayRef Args); + amd_comgr_status_t translateSpirvToBitcodeImpl(DataSet *SpirvInSet, + DataSet *BcOutSet); + public: AMDGPUCompiler(DataAction *ActionInfo, DataSet *InSet, DataSet *OutSet, llvm::raw_ostream &LogS); @@ -104,6 +115,7 @@ class AMDGPUCompiler { amd_comgr_status_t linkToRelocatable(); amd_comgr_status_t linkToExecutable(); amd_comgr_status_t compileToExecutable(); + amd_comgr_status_t compileSpirvToRelocatable(); amd_comgr_status_t translateSpirvToBitcode(); amd_comgr_language_t getLanguage() const { return ActionInfo->Language; } diff --git a/amd/comgr/src/comgr-device-libs.cpp b/amd/comgr/src/comgr-device-libs.cpp index f991a47a3014c..4e11740d3564d 100644 --- a/amd/comgr/src/comgr-device-libs.cpp +++ b/amd/comgr/src/comgr-device-libs.cpp @@ -59,10 +59,15 @@ amd_comgr_status_t addObject(DataSet *DataSet, amd_comgr_data_kind_t Kind, DataSet->DataObjects.insert(Obj); return AMD_COMGR_STATUS_SUCCESS; } -} // namespace +#include "libraries.inc" +#include "libraries_sha.inc" #include "opencl1.2-c.inc" #include "opencl2.0-c.inc" +} // namespace + +StringRef getDeviceLibrariesIdentifier() { return DEVICE_LIBS_ID; } + amd_comgr_status_t addPrecompiledHeaders(DataAction *ActionInfo, DataSet *ResultSet) { switch (ActionInfo->Language) { @@ -77,7 +82,6 @@ amd_comgr_status_t addPrecompiledHeaders(DataAction *ActionInfo, } } -#include "libraries.inc" llvm::ArrayRef> getDeviceLibraries() { static std::tuple DeviceLibs[] = { diff --git 
a/amd/comgr/src/comgr-device-libs.h b/amd/comgr/src/comgr-device-libs.h index fe99c6926724c..f0a5b9b325912 100644 --- a/amd/comgr/src/comgr-device-libs.h +++ b/amd/comgr/src/comgr-device-libs.h @@ -49,6 +49,8 @@ struct DataSet; amd_comgr_status_t addPrecompiledHeaders(DataAction *ActionInfo, DataSet *ResultSet); +llvm::StringRef getDeviceLibrariesIdentifier(); + llvm::ArrayRef> getDeviceLibraries(); diff --git a/amd/comgr/src/comgr-disassembly.cpp b/amd/comgr/src/comgr-disassembly.cpp index 3bf0e37c62990..e16598d1513c8 100644 --- a/amd/comgr/src/comgr-disassembly.cpp +++ b/amd/comgr/src/comgr-disassembly.cpp @@ -50,7 +50,7 @@ DisassemblyInfo::create(const TargetIdentifier &Ident, "-" + Ident.Environ) .str(); std::string Isa = TT + Twine("-" + Ident.Processor).str(); - SmallVector FeaturesVec; + SmallVector FeaturesVec; for (auto &Feature : Ident.Features) { FeaturesVec.push_back( diff --git a/amd/comgr/src/comgr-env.cpp b/amd/comgr/src/comgr-env.cpp index 893acc823ec25..842cc178e2103 100644 --- a/amd/comgr/src/comgr-env.cpp +++ b/amd/comgr/src/comgr-env.cpp @@ -37,10 +37,6 @@ #include "llvm/ADT/Twine.h" #include "llvm/Support/VirtualFileSystem.h" -#include -#include -#include - using namespace llvm; namespace COMGR { @@ -51,6 +47,26 @@ bool shouldSaveTemps() { return SaveTemps && StringRef(SaveTemps) != "0"; } +bool shouldSaveLLVMTemps() { + static char *SaveTemps = getenv("AMD_COMGR_SAVE_LLVM_TEMPS"); + return SaveTemps && StringRef(SaveTemps) != "0"; +} + +std::optional shouldUseVFS() { + if (shouldSaveTemps()) + return false; + + static char *UseVFS = getenv("AMD_COMGR_USE_VFS"); + if (UseVFS) { + if (StringRef(UseVFS) == "0") + return false; + else if (StringRef(UseVFS) == "1") + return true; + } + + return std::nullopt; +} + std::optional getRedirectLogs() { static char *RedirectLogs = getenv("AMD_COMGR_REDIRECT_LOGS"); if (!RedirectLogs || StringRef(RedirectLogs) == "0") { @@ -278,5 +294,34 @@ StringRef getHIPPath() { return getDetector()->getHIPPath(); } 
StringRef getLLVMPath() { return getDetector()->getLLVMPath(); } +StringRef getCachePolicy() { + static const char *EnvCachePolicy = std::getenv("AMD_COMGR_CACHE_POLICY"); + return EnvCachePolicy; +} + +StringRef getCacheDirectory() { + // By default the cache is enabled + static const char *Enable = std::getenv("AMD_COMGR_CACHE"); + bool CacheDisabled = StringRef(Enable) == "0"; + if (CacheDisabled) + return ""; + + StringRef EnvCacheDirectory = std::getenv("AMD_COMGR_CACHE_DIR"); + if (!EnvCacheDirectory.empty()) + return EnvCacheDirectory; + + // mark Result as static to keep it cached across calls + static SmallString<256> Result; + if (!Result.empty()) + return Result; + + if (sys::path::cache_directory(Result)) { + sys::path::append(Result, "comgr"); + return Result; + } + + return ""; +} + } // namespace env } // namespace COMGR diff --git a/amd/comgr/src/comgr-env.h b/amd/comgr/src/comgr-env.h index 7ca644e754e5d..203f13a886c1d 100644 --- a/amd/comgr/src/comgr-env.h +++ b/amd/comgr/src/comgr-env.h @@ -43,6 +43,8 @@ namespace env { /// Return whether the environment requests temps be saved. bool shouldSaveTemps(); +bool shouldSaveLLVMTemps(); +std::optional shouldUseVFS(); /// If the environment requests logs be redirected, return the string identifier /// of where to redirect. Otherwise return @p None. @@ -66,6 +68,15 @@ llvm::StringRef getHIPPath(); /// otherwise return the default LLVM path. 
llvm::StringRef getLLVMPath(); +/// If environment variable AMD_COMGR_CACHE_POLICY is set, return the +/// environment variable, otherwise return empty +llvm::StringRef getCachePolicy(); + +/// If environment variable AMD_COMGR_CACHE_DIR is set, return the environment +/// variable, otherwise return the default path: On Linux it's typically +/// $HOME/.cache/comgr_cache (depends on XDG_CACHE_HOME) +llvm::StringRef getCacheDirectory(); + } // namespace env } // namespace COMGR diff --git a/amd/comgr/src/comgr-isa-metadata.def b/amd/comgr/src/comgr-isa-metadata.def index 0f7354f6714c1..c926978c0b07c 100644 --- a/amd/comgr/src/comgr-isa-metadata.def +++ b/amd/comgr/src/comgr-isa-metadata.def @@ -40,66 +40,65 @@ /* #define HANDLE_ISA(TARGET_TRIPLE, PROCESSOR, \ SRAMECC_SUPPORTED, XNACK_SUPPORTED, \ - ELF_MACHINE, TRAP_HANDLER_ENABLED, LDS_SIZE, LDS_BANK_COUNT, \ + ELF_MACHINE, TRAP_HANDLER_ENABLED, IMAGE_SUPPORT, \ + LDS_BANK_COUNT, LDS_SIZE, \ EUS_PER_CU, MAX_WAVES_PER_CU, MAX_FLAT_WORK_GROUP_SIZE, \ SGPR_ALLOC_GRANULE, TOTAL_NUM_SGPRS, ADDRESSABLE_NUM_SGPRS, \ VGPR_ALLOC_GRANULE, TOTAL_NUM_VGPRS, ADDRESSABLE_NUM_VGPRS) \ - ----LDS--- ----CU--- WG ------SGPR----- ------VGPR----- - TARGET_TRIPLE PROCESSOR SRAMECC XNACK ELF_MACHINE TRAP Size Bnks EUs Waves Max Alloc Max Addr Alloc Max Addr */ -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx600", false, false, EF_AMDGPU_MACH_AMDGCN_GFX600, true, 65536, 32, 4, 40, 1024, 8, 512, 104, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx601", false, false, EF_AMDGPU_MACH_AMDGCN_GFX601, true, 65536, 32, 4, 40, 1024, 8, 512, 104, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx602", false, false, EF_AMDGPU_MACH_AMDGCN_GFX602, true, 65536, 32, 4, 40, 1024, 8, 512, 104, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx700", false, false, EF_AMDGPU_MACH_AMDGCN_GFX700, true, 65536, 32, 4, 40, 1024, 8, 512, 104, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx701", false, false, EF_AMDGPU_MACH_AMDGCN_GFX701, true, 65536, 32, 4, 40, 
1024, 8, 512, 104, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx702", false, false, EF_AMDGPU_MACH_AMDGCN_GFX702, true, 65536, 16, 4, 40, 1024, 8, 512, 104, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx703", false, false, EF_AMDGPU_MACH_AMDGCN_GFX703, true, 65536, 16, 4, 40, 1024, 8, 512, 104, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx704", false, false, EF_AMDGPU_MACH_AMDGCN_GFX704, true, 65536, 16, 4, 40, 1024, 8, 512, 104, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx705", false, false, EF_AMDGPU_MACH_AMDGCN_GFX705, true, 65536, 16, 4, 40, 1024, 8, 512, 104, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx801", false, true, EF_AMDGPU_MACH_AMDGCN_GFX801, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx802", false, false, EF_AMDGPU_MACH_AMDGCN_GFX802, true, 65536, 32, 4, 40, 1024, 16, 800, 96, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx803", false, false, EF_AMDGPU_MACH_AMDGCN_GFX803, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx805", false, false, EF_AMDGPU_MACH_AMDGCN_GFX805, true, 65536, 32, 4, 40, 1024, 16, 800, 96, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx810", false, true, EF_AMDGPU_MACH_AMDGCN_GFX810, true, 65536, 16, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx900", false, true, EF_AMDGPU_MACH_AMDGCN_GFX900, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx902", false, true, EF_AMDGPU_MACH_AMDGCN_GFX902, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx904", false, true, EF_AMDGPU_MACH_AMDGCN_GFX904, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx906", true, true, EF_AMDGPU_MACH_AMDGCN_GFX906, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx908", true, true, EF_AMDGPU_MACH_AMDGCN_GFX908, true, 
65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx909", false, true, EF_AMDGPU_MACH_AMDGCN_GFX909, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx90a", true, true, EF_AMDGPU_MACH_AMDGCN_GFX90A, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx90c", false, true, EF_AMDGPU_MACH_AMDGCN_GFX90C, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx940", true, true, EF_AMDGPU_MACH_AMDGCN_GFX940, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx941", true, true, EF_AMDGPU_MACH_AMDGCN_GFX941, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx942", true, true, EF_AMDGPU_MACH_AMDGCN_GFX942, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx950", true, true, EF_AMDGPU_MACH_AMDGCN_GFX950, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1010", false, true, EF_AMDGPU_MACH_AMDGCN_GFX1010, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1011", false, true, EF_AMDGPU_MACH_AMDGCN_GFX1011, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1012", false, true, EF_AMDGPU_MACH_AMDGCN_GFX1012, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1013", false, true, EF_AMDGPU_MACH_AMDGCN_GFX1013, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1030", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1030, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1031", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1031, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1032", false, false, 
EF_AMDGPU_MACH_AMDGCN_GFX1032, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1033", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1033, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1034", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1034, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1035", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1035, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1036", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1036, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1100", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1100, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 24, 1536, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1101", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1101, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 24, 1536, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1102", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1102, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 16, 1024, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1103", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1103, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 16, 1024, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1150", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1150, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 16, 1024, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1151", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1151, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 24, 1536, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1152", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1152, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 16, 1024, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1153", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1153, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 16, 1024, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1200", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1200, true, 65536, 32, 4, 40, 
1024, 106, 800, 106, 24, 1536, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1201", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1201, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 24, 1536, 256) + ---LDS--- ----CU--- WG ------SGPR----- ------VGPR----- + TARGET_TRIPLE PROCESSOR SRAMECC XNACK ELF_MACHINE TRAP IMAGE Size Bnks EUs Waves Max Alloc Max Addr Alloc Max Addr */ +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx600", false, false, EF_AMDGPU_MACH_AMDGCN_GFX600, true, true, 65536, 32, 4, 40, 1024, 8, 512, 104, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx601", false, false, EF_AMDGPU_MACH_AMDGCN_GFX601, true, true, 65536, 32, 4, 40, 1024, 8, 512, 104, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx602", false, false, EF_AMDGPU_MACH_AMDGCN_GFX602, true, true, 65536, 32, 4, 40, 1024, 8, 512, 104, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx700", false, false, EF_AMDGPU_MACH_AMDGCN_GFX700, true, true, 65536, 32, 4, 40, 1024, 8, 512, 104, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx701", false, false, EF_AMDGPU_MACH_AMDGCN_GFX701, true, true, 65536, 32, 4, 40, 1024, 8, 512, 104, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx702", false, false, EF_AMDGPU_MACH_AMDGCN_GFX702, true, true, 65536, 16, 4, 40, 1024, 8, 512, 104, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx703", false, false, EF_AMDGPU_MACH_AMDGCN_GFX703, true, true, 65536, 16, 4, 40, 1024, 8, 512, 104, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx704", false, false, EF_AMDGPU_MACH_AMDGCN_GFX704, true, true, 65536, 16, 4, 40, 1024, 8, 512, 104, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx705", false, false, EF_AMDGPU_MACH_AMDGCN_GFX705, true, true, 65536, 16, 4, 40, 1024, 8, 512, 104, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx801", false, true, EF_AMDGPU_MACH_AMDGCN_GFX801, true, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx802", false, false, EF_AMDGPU_MACH_AMDGCN_GFX802, true, true, 65536, 32, 4, 40, 1024, 16, 800, 96, 
4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx803", false, false, EF_AMDGPU_MACH_AMDGCN_GFX803, true, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx805", false, false, EF_AMDGPU_MACH_AMDGCN_GFX805, true, true, 65536, 32, 4, 40, 1024, 16, 800, 96, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx810", false, true, EF_AMDGPU_MACH_AMDGCN_GFX810, true, true, 65536, 16, 4, 40, 1024, 16, 800, 102, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx900", false, true, EF_AMDGPU_MACH_AMDGCN_GFX900, true, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx902", false, true, EF_AMDGPU_MACH_AMDGCN_GFX902, true, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx904", false, true, EF_AMDGPU_MACH_AMDGCN_GFX904, true, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx906", true, true, EF_AMDGPU_MACH_AMDGCN_GFX906, true, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx908", true, true, EF_AMDGPU_MACH_AMDGCN_GFX908, true, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx909", false, true, EF_AMDGPU_MACH_AMDGCN_GFX909, true, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx90a", true, true, EF_AMDGPU_MACH_AMDGCN_GFX90A, true, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 8, 512, 512) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx90c", false, true, EF_AMDGPU_MACH_AMDGCN_GFX90C, true, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx942", true, true, EF_AMDGPU_MACH_AMDGCN_GFX942, true, false, 65536, 32, 4, 40, 1024, 16, 800, 102, 8, 512, 512) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx950", true, true, EF_AMDGPU_MACH_AMDGCN_GFX950, true, false, 65536, 32, 4, 40, 1024, 16, 800, 102, 8, 512, 512) +HANDLE_ISA("amdgcn-amd-amdhsa-", 
"gfx1010", false, true, EF_AMDGPU_MACH_AMDGCN_GFX1010, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1011", false, true, EF_AMDGPU_MACH_AMDGCN_GFX1011, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1012", false, true, EF_AMDGPU_MACH_AMDGCN_GFX1012, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1013", false, true, EF_AMDGPU_MACH_AMDGCN_GFX1013, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1030", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1030, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1031", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1031, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1032", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1032, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1033", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1033, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1034", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1034, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1035", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1035, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1036", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1036, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1100", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1100, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 24, 1536, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1101", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1101, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 24, 1536, 256) 
+HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1102", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1102, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 16, 1024, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1103", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1103, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 16, 1024, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1150", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1150, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 16, 1024, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1151", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1151, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 24, 1536, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1152", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1152, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 16, 1024, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1153", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1153, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 16, 1024, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1200", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1200, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 24, 1536, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx1201", false, false, EF_AMDGPU_MACH_AMDGCN_GFX1201, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 24, 1536, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx9-generic", false, true, EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx9-4-generic", true, true, EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx10-1-generic", false, true, EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx10-3-generic", false, false, EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx11-generic", false, false, EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, true, 
65536, 32, 4, 40, 1024, 106, 800, 106, 16, 1024, 256) -HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx12-generic", false, false, EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 24, 1536, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx9-generic", false, true, EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, true, true, 65536, 32, 4, 40, 1024, 16, 800, 102, 4, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx9-4-generic", true, true, EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, true, false, 65536, 32, 4, 40, 1024, 16, 800, 102, 8, 512, 512) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx10-1-generic", false, true, EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx10-3-generic", false, false, EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 8, 256, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx11-generic", false, false, EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 16, 1024, 256) +HANDLE_ISA("amdgcn-amd-amdhsa-", "gfx12-generic", false, false, EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC, true, true, 65536, 32, 4, 40, 1024, 106, 800, 106, 24, 1536, 256) #undef HANDLE_ISA diff --git a/amd/comgr/src/comgr-metadata.cpp b/amd/comgr/src/comgr-metadata.cpp index 450ea15415530..a89786ab8e498 100644 --- a/amd/comgr/src/comgr-metadata.cpp +++ b/amd/comgr/src/comgr-metadata.cpp @@ -357,6 +357,7 @@ struct IsaInfo { bool XnackSupported; unsigned ElfMachine; bool TrapHandlerEnabled; + bool ImageSupport; unsigned LDSSize; unsigned LDSBankCount; unsigned EUsPerCU; @@ -371,16 +372,17 @@ struct IsaInfo { } IsaInfos[] = { #define HANDLE_ISA(TARGET_TRIPLE, PROCESSOR, SRAMECC_SUPPORTED, \ XNACK_SUPPORTED, ELF_MACHINE, TRAP_HANDLER_ENABLED, \ - LDS_SIZE, LDS_BANK_COUNT, EUS_PER_CU, MAX_WAVES_PER_CU, \ - MAX_FLAT_WORK_GROUP_SIZE, SGPR_ALLOC_GRANULE, \ - TOTAL_NUM_SGPRS, ADDRESSABLE_NUM_SGPRS, VGPR_ALLOC_GRANULE, \ - TOTAL_NUM_VGPRS, 
ADDRESSABLE_NUM_VGPRS) \ + IMAGE_SUPPORT, LDS_SIZE, LDS_BANK_COUNT, EUS_PER_CU, \ + MAX_WAVES_PER_CU, MAX_FLAT_WORK_GROUP_SIZE, \ + SGPR_ALLOC_GRANULE, TOTAL_NUM_SGPRS, ADDRESSABLE_NUM_SGPRS, \ + VGPR_ALLOC_GRANULE, TOTAL_NUM_VGPRS, ADDRESSABLE_NUM_VGPRS) \ {TARGET_TRIPLE "-" PROCESSOR, \ PROCESSOR, \ SRAMECC_SUPPORTED, \ XNACK_SUPPORTED, \ ELF::ELF_MACHINE, \ TRAP_HANDLER_ENABLED, \ + IMAGE_SUPPORT, \ LDS_SIZE, \ LDS_BANK_COUNT, \ EUS_PER_CU, \ @@ -880,6 +882,8 @@ amd_comgr_status_t getIsaMetadata(StringRef IsaName, auto Info = IsaInfos[IsaIndex]; Root["TrapHandlerEnabled"] = Doc.getNode(std::to_string(Info.TrapHandlerEnabled), /*Copy=*/true); + Root["ImageSupport"] = + Doc.getNode(std::to_string(Info.ImageSupport), /*Copy=*/true); Root["LocalMemorySize"] = Doc.getNode(std::to_string(Info.LDSSize), /*Copy=*/true); Root["EUsPerCU"] = Doc.getNode(std::to_string(Info.EUsPerCU), /*Copy=*/true); diff --git a/amd/comgr/src/comgr-spirv-command.cpp b/amd/comgr/src/comgr-spirv-command.cpp new file mode 100644 index 0000000000000..6f32a035c0e99 --- /dev/null +++ b/amd/comgr/src/comgr-spirv-command.cpp @@ -0,0 +1,113 @@ +/******************************************************************************* + * + * University of Illinois/NCSA + * Open Source License + * + * Copyright (c) 2003-2017 University of Illinois at Urbana-Champaign. + * Modifications (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the names of the LLVM Team, University of Illinois at + * Urbana-Champaign, nor the names of its contributors may be used to + * endorse or promote products derived from this Software without specific + * prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. 
+ * + ******************************************************************************/ + +#include "comgr-spirv-command.h" + +#ifndef COMGR_DISABLE_SPIRV +#include "comgr-diagnostic-handler.h" + +#include +#include +#include + +#include +#endif + +namespace COMGR { +using namespace llvm; +Error SPIRVCommand::writeExecuteOutput(StringRef CachedBuffer) { + assert(OutputBuffer.empty()); + OutputBuffer.reserve(CachedBuffer.size()); + OutputBuffer.insert(OutputBuffer.end(), CachedBuffer.begin(), + CachedBuffer.end()); + return Error::success(); +} + +Expected SPIRVCommand::readExecuteOutput() { + return StringRef(OutputBuffer.data(), OutputBuffer.size()); +} + +amd_comgr_status_t SPIRVCommand::execute(raw_ostream &LogS) { +#ifndef COMGR_DISABLE_SPIRV + LLVMContext Context; + Context.setDiagnosticHandler( + std::make_unique(LogS), true); + + // TODO: With C++23, we should investigate replacing with spanstream + // to avoid memory copies: + // https://en.cppreference.com/w/cpp/io/basic_ispanstream + std::istringstream ISS(std::string(InputBuffer.data(), InputBuffer.size())); + + Module *M; + std::string Err; + + SPIRV::TranslatorOpts Opts; + Opts.enableAllExtensions(); + Opts.setDesiredBIsRepresentation(SPIRV::BIsRepresentation::OpenCL20); + + if (!readSpirv(Context, Opts, ISS, M, Err)) { + LogS << "Failed to load SPIR-V as LLVM Module: " << Err << '\n'; + return AMD_COMGR_STATUS_ERROR; + } + + BitcodeWriter Writer(OutputBuffer); + Writer.writeModule(*M, false, nullptr, false, nullptr); + Writer.writeSymtab(); + Writer.writeStrtab(); + return AMD_COMGR_STATUS_SUCCESS; +#else + return AMD_COMGR_STATUS_ERROR; +#endif +} + +SPIRVCommand::ActionClass SPIRVCommand::getClass() const { + // return an action class that is not allocated to distinguish it from any + // clang action + return clang::driver::Action::ActionClass::JobClassLast + 1; +} + +void SPIRVCommand::addOptionsIdentifier(HashAlgorithm &) const { + // do nothing, there are no options + return; +} + +Error 
SPIRVCommand::addInputIdentifier(HashAlgorithm &H) const { + addString(H, InputBuffer); + return Error::success(); +} +} // namespace COMGR diff --git a/amd/comgr/src/comgr-spirv-command.h b/amd/comgr/src/comgr-spirv-command.h new file mode 100644 index 0000000000000..20e4a503c3610 --- /dev/null +++ b/amd/comgr/src/comgr-spirv-command.h @@ -0,0 +1,69 @@ +/******************************************************************************* + * + * University of Illinois/NCSA + * Open Source License + * + * Copyright (c) 2003-2017 University of Illinois at Urbana-Champaign. + * Modifications (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the names of the LLVM Team, University of Illinois at + * Urbana-Champaign, nor the names of its contributors may be used to + * endorse or promote products derived from this Software without specific + * prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. + * + ******************************************************************************/ + +#ifndef COMGR_SPIRV_COMMAND_H +#define COMGR_SPIRV_COMMAND_H + +#include "comgr-cache-command.h" +#include "comgr.h" + +namespace COMGR { +class SPIRVCommand : public CachedCommandAdaptor { +public: + llvm::StringRef InputBuffer; + llvm::SmallVectorImpl &OutputBuffer; + +public: + SPIRVCommand(DataObject *Input, llvm::SmallVectorImpl &OutputBuffer) + : InputBuffer(Input->Data, Input->Size), OutputBuffer(OutputBuffer) {} + + bool canCache() const final { return true; } + llvm::Error writeExecuteOutput(llvm::StringRef CachedBuffer) final; + llvm::Expected readExecuteOutput() final; + amd_comgr_status_t execute(llvm::raw_ostream &LogS) final; + + ~SPIRVCommand() override = default; + +protected: + ActionClass getClass() const override; + void addOptionsIdentifier(HashAlgorithm &) const override; + llvm::Error addInputIdentifier(HashAlgorithm &) const override; +}; +} // namespace COMGR + +#endif diff --git a/amd/comgr/src/comgr.cpp b/amd/comgr/src/comgr.cpp index 55d8c7f2c88a6..053c03017d53b 100644 --- a/amd/comgr/src/comgr.cpp +++ b/amd/comgr/src/comgr.cpp @@ -187,6 +187,8 @@ amd_comgr_status_t dispatchCompilerAction(amd_comgr_action_kind_t ActionKind, return Compiler.compileToBitcode(true); case AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE: return Compiler.compileToExecutable(); + case AMD_COMGR_ACTION_COMPILE_SPIRV_TO_RELOCATABLE: + return Compiler.compileSpirvToRelocatable(); case AMD_COMGR_ACTION_TRANSLATE_SPIRV_TO_BC: return Compiler.translateSpirvToBitcode(); @@ -291,6 +293,8 @@ StringRef getActionKindName(amd_comgr_action_kind_t ActionKind) { return 
"AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE"; case AMD_COMGR_ACTION_UNBUNDLE: return "AMD_COMGR_ACTION_UNBUNDLE"; + case AMD_COMGR_ACTION_COMPILE_SPIRV_TO_RELOCATABLE: + return "AMD_COMGR_ACTION_COMPILE_SPIRV_TO_RELOCATABLE"; case AMD_COMGR_ACTION_TRANSLATE_SPIRV_TO_BC: return "AMD_COMGR_ACTION_TRANSLATE_SPIRV_TO_BC"; } @@ -317,6 +321,10 @@ amd_comgr_status_t COMGR::setCStr(char *&Dest, StringRef Src, size_t *Size) { return AMD_COMGR_STATUS_SUCCESS; } +StringRef COMGR::getComgrHashIdentifier() { + return xstringify(AMD_COMGR_VERSION_ID); +} + amd_comgr_status_t COMGR::parseTargetIdentifier(StringRef IdentStr, TargetIdentifier &Ident) { SmallVector IsaNameComponents; @@ -336,8 +344,17 @@ amd_comgr_status_t COMGR::parseTargetIdentifier(StringRef IdentStr, Ident.Processor = Ident.Features[0]; Ident.Features.erase(Ident.Features.begin()); - size_t IsaIndex; + // TODO: Add a LIT test for this + if (IdentStr == "amdgcn-amd-amdhsa--amdgcnspirv" || + IdentStr == "amdgcn-amd-amdhsa-unknown-amdgcnspirv") { + // Features not supported for SPIR-V + if (!Ident.Features.empty()) + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + return AMD_COMGR_STATUS_SUCCESS; + } + + size_t IsaIndex; amd_comgr_status_t Status = metadata::getIsaIndex(IdentStr, IsaIndex); if (Status != AMD_COMGR_STATUS_SUCCESS) { return Status; @@ -1009,6 +1026,10 @@ amd_comgr_status_t AMD_COMGR_API return AMD_COMGR_STATUS_SUCCESS; } + if (StringRef(IsaName) == "amdgcn-amd-amdhsa--amdgcnspirv") { + return ActionP->setIsaName(IsaName); + } + if (!metadata::isValidIsaName(IsaName)) { return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; } @@ -1182,6 +1203,22 @@ amd_comgr_status_t AMD_COMGR_API return ActionP->setBundleEntryIDs(ArrayRef(EntryIDs, Count)); } +amd_comgr_status_t AMD_COMGR_API + // NOLINTNEXTLINE(readability-identifier-naming) + amd_comgr_action_info_set_vfs + // + (amd_comgr_action_info_t ActionInfo, bool ShouldUseVFS) { + DataAction *ActionP = DataAction::convert(ActionInfo); + + if (!ActionP) { + 
return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + + ActionP->ShouldUseVFS = ShouldUseVFS; + + return AMD_COMGR_STATUS_SUCCESS; +} + amd_comgr_status_t AMD_COMGR_API // NOLINTNEXTLINE(readability-identifier-naming) amd_comgr_action_info_set_device_lib_linking @@ -1372,6 +1409,7 @@ amd_comgr_status_t AMD_COMGR_API case AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE: case AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC: case AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE: + case AMD_COMGR_ACTION_COMPILE_SPIRV_TO_RELOCATABLE: case AMD_COMGR_ACTION_TRANSLATE_SPIRV_TO_BC: ActionStatus = dispatchCompilerAction(ActionKind, ActionInfoP, InputSetP, ResultSetP, *LogP); @@ -2104,6 +2142,8 @@ amd_comgr_populate_name_expression_map(amd_comgr_data_t Data, size_t *Count) { if (!RelaRangeOrError) { llvm::logAllUnhandledErrors(RelaRangeOrError.takeError(), llvm::errs(), "RelaRange creation error: "); + for (auto *Ptr : NameExpDataVec) + delete Ptr; return AMD_COMGR_STATUS_ERROR; } auto RelaRange = std::move(RelaRangeOrError.get()); @@ -2124,6 +2164,8 @@ amd_comgr_populate_name_expression_map(amd_comgr_data_t Data, size_t *Count) { if (!RodataOrError) { llvm::logAllUnhandledErrors(RodataOrError.takeError(), llvm::errs(), "Rodata creation error: "); + for (auto *Ptr : NameExpDataVec) + delete Ptr; return AMD_COMGR_STATUS_ERROR; } auto Rodata = std::move(RodataOrError.get()); @@ -2154,6 +2196,8 @@ amd_comgr_populate_name_expression_map(amd_comgr_data_t Data, size_t *Count) { } } + for (auto *Ptr : NameExpDataVec) + delete Ptr; } // end AMD_COMGR_DATA_KIND_EXECUTABLE conditional *Count = DataP->NameExpressionMap.size(); diff --git a/amd/comgr/src/comgr.h b/amd/comgr/src/comgr.h index 04d8a172f4280..bacfadc986b43 100644 --- a/amd/comgr/src/comgr.h +++ b/amd/comgr/src/comgr.h @@ -131,6 +131,7 @@ struct DataObject { DataSymbol *DataSym; std::vector MangledNames; std::map NameExpressionMap; + llvm::SmallVector SpirvFlags; private: std::unique_ptr Buffer; @@ -230,6 +231,7 @@ struct 
DataAction { amd_comgr_language_t Language; bool Logging; bool ShouldLinkDeviceLibs = false; + bool ShouldUseVFS = true; std::vector BundleEntryIDs; @@ -315,6 +317,10 @@ struct NameExpressionData { long unsigned int RodataOffset; }; +// get a string identifying comgr: this is a combination of comgr's version, +// device-libs contents and opencl-c.h contents. +llvm::StringRef getComgrHashIdentifier(); + } // namespace COMGR #endif // header guard diff --git a/amd/comgr/src/exportmap.in b/amd/comgr/src/exportmap.in index 17ab22512c9f9..88a96ed562515 100644 --- a/amd/comgr/src/exportmap.in +++ b/amd/comgr/src/exportmap.in @@ -90,3 +90,7 @@ global: amd_comgr_action_info_set_bundle_entry_ids; @amd_comgr_NAME@_2.9 { global: amd_comgr_action_info_set_device_lib_linking; } @amd_comgr_NAME@_2.8; + +@amd_comgr_NAME@3.1 { +global: amd_comgr_action_info_set_vfs; +} @amd_comgr_NAME@_2.9; diff --git a/amd/comgr/test-lit/CMakeLists.txt b/amd/comgr/test-lit/CMakeLists.txt index aabcd1aa8a21a..c98dccef6cdd2 100644 --- a/amd/comgr/test-lit/CMakeLists.txt +++ b/amd/comgr/test-lit/CMakeLists.txt @@ -19,6 +19,7 @@ if (NOT DEFINED LLVM_LIT_PATH) set(LLVM_LIT_PATH "${LLVM_TOOLS_BINARY_DIR}/llvm-lit") endif() endif() +message("--LLVM_LIT_PATH: ${LLVM_LIT_PATH}") add_custom_target(test-lit COMMAND "${LLVM_LIT_PATH}" "${CMAKE_CURRENT_BINARY_DIR}" -v) @@ -37,5 +38,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_comgr_lit_binary(source-to-bc-with-dev-libs) add_comgr_lit_binary(spirv-translator) +add_comgr_lit_binary(compile-minimal-test) +add_comgr_lit_binary(spirv-to-reloc) add_dependencies(check-comgr test-lit) diff --git a/amd/comgr/test-lit/comgr-sources/compile-minimal-test.c b/amd/comgr/test-lit/comgr-sources/compile-minimal-test.c new file mode 100644 index 0000000000000..eca64e9a520d6 --- /dev/null +++ b/amd/comgr/test-lit/comgr-sources/compile-minimal-test.c @@ -0,0 +1,141 @@ +/******************************************************************************* + 
* + * University of Illinois/NCSA + * Open Source License + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * with the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the names of Advanced Micro Devices, Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this Software without specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH + * THE SOFTWARE. 
+ * + ******************************************************************************/ + +#include "amd_comgr.h" +#include "common.h" +#include +#include +#include + +int main(int argc, char *argv[]) { + char *BufSource; + size_t SizeSource; + amd_comgr_data_t DataSource; + amd_comgr_data_set_t DataSetIn, DataSetBc, DataSetLinked, DataSetReloc, + DataSetExec; + amd_comgr_action_info_t DataAction; + size_t Count; + const char *CodeGenOptions[] = {"-mllvm", "--color"}; + size_t CodeGenOptionsCount = + sizeof(CodeGenOptions) / sizeof(CodeGenOptions[0]); + + SizeSource = setBuf(argv[1], &BufSource); + + amd_comgr_(create_data_set(&DataSetIn)); + amd_comgr_(create_data(AMD_COMGR_DATA_KIND_SOURCE, &DataSource)); + amd_comgr_(set_data(DataSource, SizeSource, BufSource)); + amd_comgr_(set_data_name(DataSource, "source1.cl")); + amd_comgr_(data_set_add(DataSetIn, DataSource)); + + amd_comgr_(create_action_info(&DataAction)); + amd_comgr_( + action_info_set_language(DataAction, AMD_COMGR_LANGUAGE_OPENCL_1_2)); + amd_comgr_(action_info_set_isa_name(DataAction, "amdgcn-amd-amdhsa--gfx900")); + amd_comgr_(action_info_set_option_list(DataAction, CodeGenOptions, + CodeGenOptionsCount)); + amd_comgr_(create_data_set(&DataSetBc)); + amd_comgr_(do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, DataAction, + DataSetIn, DataSetBc)); + amd_comgr_(action_data_count(DataSetBc, AMD_COMGR_DATA_KIND_BC, &Count)); + + if (Count != 1) { + printf("AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC Failed: " + "produced %zu BC objects (expected 1)\n", + Count); + exit(1); + } + + amd_comgr_(create_data_set(&DataSetLinked)); + + amd_comgr_(do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, DataAction, DataSetBc, + DataSetLinked)); + amd_comgr_(action_data_count(DataSetLinked, AMD_COMGR_DATA_KIND_BC, &Count)); + if (Count != 1) { + printf("AMD_COMGR_ACTION_LINK_BC_TO_BC Failed: " + "produced %zu BC objects (expected 1)\n", + Count); + exit(1); + } + + amd_comgr_(create_data_set(&DataSetReloc)); + + 
amd_comgr_(action_info_set_device_lib_linking(DataAction, true)); + + amd_comgr_(do_action(AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE, DataAction, + DataSetLinked, DataSetReloc)); + + amd_comgr_( + action_data_count(DataSetReloc, AMD_COMGR_DATA_KIND_RELOCATABLE, &Count)); + + if (Count != 1) { + printf("AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE Failed: " + "produced %zu source objects (expected 1)\n", + Count); + exit(1); + } + + amd_comgr_(create_data_set(&DataSetExec)); + + amd_comgr_(action_info_set_option_list(DataAction, NULL, 0)); + + amd_comgr_(do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, + DataAction, DataSetReloc, DataSetExec)); + + amd_comgr_( + action_data_count(DataSetExec, AMD_COMGR_DATA_KIND_EXECUTABLE, &Count)); + + if (Count != 1) { + printf("AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE Failed: " + "produced %zu executable objects (expected 1)\n", + Count); + exit(1); + } + + amd_comgr_data_t DataExec; + amd_comgr_(action_data_get_data(DataSetExec, AMD_COMGR_DATA_KIND_EXECUTABLE, + 0, &DataExec)); + dumpData(DataExec, argv[2]); + + amd_comgr_(release_data(DataSource)); + amd_comgr_(release_data(DataExec)); + amd_comgr_(destroy_data_set(DataSetIn)); + amd_comgr_(destroy_data_set(DataSetBc)); + amd_comgr_(destroy_data_set(DataSetLinked)); + amd_comgr_(destroy_data_set(DataSetReloc)); + amd_comgr_(destroy_data_set(DataSetExec)); + amd_comgr_(destroy_action_info(DataAction)); + free(BufSource); + return 0; +} diff --git a/amd/comgr/test-lit/comgr-sources/source-to-bc-with-dev-libs.c b/amd/comgr/test-lit/comgr-sources/source-to-bc-with-dev-libs.c index bdf0c493bf600..d7d793040c627 100644 --- a/amd/comgr/test-lit/comgr-sources/source-to-bc-with-dev-libs.c +++ b/amd/comgr/test-lit/comgr-sources/source-to-bc-with-dev-libs.c @@ -14,8 +14,9 @@ int main(int argc, char *argv[]) { "-mcode-object-version=5", "-mllvm", "-amdgpu-prelink"}; size_t CodeGenOptionsCount = sizeof(CodeGenOptions) / sizeof(CodeGenOptions[0]); - if (argc != 4) { - 
fprintf(stderr, "Usage: source-to-bc-with-device-libs file.cl -o file.bc\n"); + if (argc < 4 || argc > 5) { + fprintf(stderr, "Usage: source-to-bc-with-device-libs file.cl " + "[--vfs|--novfs] -o file.bc\n"); exit(1); } @@ -33,6 +34,12 @@ int main(int argc, char *argv[]) { amd_comgr_(action_info_set_isa_name(DataAction, "amdgcn-amd-amdhsa--gfx900")); amd_comgr_(create_data_set(&DataSetPch)); + if (!strncmp(argv[2], "--vfs", 5)) { + amd_comgr_(action_info_set_vfs(DataAction, true)); + } else if (!strncmp(argv[2], "--novfs", 7)) { + amd_comgr_(action_info_set_vfs(DataAction, false)); + } + amd_comgr_(do_action(AMD_COMGR_ACTION_ADD_PRECOMPILED_HEADERS, DataAction, DataSetIn, DataSetPch)); @@ -63,9 +70,9 @@ int main(int argc, char *argv[]) { } amd_comgr_data_t DataBc; - amd_comgr_(action_data_get_data(DataSetBc, AMD_COMGR_DATA_KIND_BC, 0, - &DataBc)); - dumpData(DataBc, argv[3]); + amd_comgr_( + action_data_get_data(DataSetBc, AMD_COMGR_DATA_KIND_BC, 0, &DataBc)); + dumpData(DataBc, argv[argc - 1]); amd_comgr_(release_data(DataSource)); amd_comgr_(release_data(DataBc)); diff --git a/amd/comgr/test-lit/comgr-sources/spirv-to-reloc.c b/amd/comgr/test-lit/comgr-sources/spirv-to-reloc.c new file mode 100644 index 0000000000000..1056515b052e2 --- /dev/null +++ b/amd/comgr/test-lit/comgr-sources/spirv-to-reloc.c @@ -0,0 +1,57 @@ +#include "amd_comgr.h" +#include "common.h" +#include +#include +#include + +int main(int argc, char *argv[]) { + char *BufSpv; + size_t SizeSpv; + amd_comgr_data_t DataSpv; + amd_comgr_data_set_t DataSetSpv, DataSetReloc; + amd_comgr_action_info_t DataAction; + size_t Count; + + if (argc != 3) { + fprintf(stderr, "Usage: spirv-to-reloc file.spv file.o\n"); + exit(1); + } + + SizeSpv = setBuf(argv[1], &BufSpv); + + amd_comgr_(create_data(AMD_COMGR_DATA_KIND_SPIRV, &DataSpv)); + amd_comgr_(set_data(DataSpv, SizeSpv, BufSpv)); + amd_comgr_(set_data_name(DataSpv, "file.spv")); + + amd_comgr_(create_data_set(&DataSetSpv)); + 
amd_comgr_(data_set_add(DataSetSpv, DataSpv)); + + amd_comgr_(create_action_info(&DataAction)); + amd_comgr_(action_info_set_language(DataAction, AMD_COMGR_LANGUAGE_HIP)); + amd_comgr_(action_info_set_isa_name(DataAction, "amdgcn-amd-amdhsa--gfx900")); + + amd_comgr_(create_data_set(&DataSetReloc)); + amd_comgr_(do_action(AMD_COMGR_ACTION_COMPILE_SPIRV_TO_RELOCATABLE, + DataAction, DataSetSpv, DataSetReloc)); + + amd_comgr_( + action_data_count(DataSetReloc, AMD_COMGR_DATA_KIND_RELOCATABLE, &Count)); + + if (Count != 1) { + printf("AMD_COMGR_ACTION_COMPILE_SPIRV_TO_RELOCATABLE Failed: " + "produced %zu RELOCATABLE objects (expected 1)\n", + Count); + exit(1); + } + + amd_comgr_data_t DataReloc; + amd_comgr_(action_data_get_data(DataSetReloc, AMD_COMGR_DATA_KIND_RELOCATABLE, + 0, &DataReloc)); + dumpData(DataReloc, argv[2]); + + amd_comgr_(release_data(DataSpv)); + amd_comgr_(destroy_data_set(DataSetSpv)); + amd_comgr_(destroy_data_set(DataSetReloc)); + amd_comgr_(destroy_action_info(DataAction)); + free(BufSpv); +} diff --git a/amd/comgr/test-lit/compile-minimal-test-cached-bad-dir.cl b/amd/comgr/test-lit/compile-minimal-test-cached-bad-dir.cl new file mode 100644 index 0000000000000..45d589ddce7c4 --- /dev/null +++ b/amd/comgr/test-lit/compile-minimal-test-cached-bad-dir.cl @@ -0,0 +1,13 @@ +// RUN: export AMD_COMGR_CACHE=1 +// +// COM: fail to create the cache, but still produce something valid +// RUN: rm -f %t.log +// RUN: echo "not a directory" > %t.txt +// RUN: AMD_COMGR_CACHE_DIR=%t.txt \ +// RUN: AMD_COMGR_EMIT_VERBOSE_LOGS=1 \ +// RUN: AMD_COMGR_REDIRECT_LOGS=%t.log \ +// RUN: compile-minimal-test %S/compile-minimal-test.cl %t.bin +// RUN: llvm-objdump -d %t.bin | FileCheck %S/compile-minimal-test.cl +// RUN: FileCheck --check-prefix=BAD %s < %t.log +// BAD: Failed to open cache file +// BAD-SAME: Not a directory diff --git a/amd/comgr/test-lit/compile-minimal-test-cached-bad-policy.cl b/amd/comgr/test-lit/compile-minimal-test-cached-bad-policy.cl new file 
mode 100644 index 0000000000000..3a88e794a2902 --- /dev/null +++ b/amd/comgr/test-lit/compile-minimal-test-cached-bad-policy.cl @@ -0,0 +1,15 @@ +// RUN: export AMD_COMGR_CACHE=1 +// +// COM: fail to create the cache, but still produce something valid +// RUN: rm -f %t_log +// RUN: AMD_COMGR_CACHE_DIR=%t.cache \ +// RUN: AMD_COMGR_CACHE_POLICY="foo=2h" \ +// RUN: AMD_COMGR_EMIT_VERBOSE_LOGS=1 \ +// RUN: AMD_COMGR_REDIRECT_LOGS=%t.log \ +// RUN: compile-minimal-test %S/compile-minimal-test.cl %t.bin +// RUN: llvm-objdump -d %t.bin | FileCheck %S/compile-minimal-test.cl +// RUN: FileCheck --check-prefix=BAD %s < %t.log +// BAD: when parsing the cache policy: Unknown key: 'foo' +// +// COM: the cache has not been created since we couldn't parse the policy +// RUN: [ ! -d %t.cache ] diff --git a/amd/comgr/test-lit/compile-minimal-test-cached.cl b/amd/comgr/test-lit/compile-minimal-test-cached.cl new file mode 100644 index 0000000000000..b2f6a034be284 --- /dev/null +++ b/amd/comgr/test-lit/compile-minimal-test-cached.cl @@ -0,0 +1,28 @@ +// RUN: rm -fr %t.cache +// +// RUN: unset AMD_COMGR_CACHE +// RUN: AMD_COMGR_CACHE_DIR=%t.cache compile-minimal-test %S/compile-minimal-test.cl %t.bin +// RUN: llvm-objdump -d %t.bin | FileCheck %S/compile-minimal-test.cl +// RUN: [ -d %t.cache ] +// +// RUN: rm -fr %t.cache +// +// RUN: export AMD_COMGR_CACHE=0 +// RUN: AMD_COMGR_CACHE_DIR=%t.cache compile-minimal-test %S/compile-minimal-test.cl %t.bin +// RUN: llvm-objdump -d %t.bin | FileCheck %S/compile-minimal-test.cl +// RUN: [ ! 
-d %t.cache ] +// +// RUN: export AMD_COMGR_CACHE=1 +// +// COM: run once and check that the cache directory exists and it has more than 1 element (one for the cache tag, one or more for the cached commands) +// RUN: AMD_COMGR_CACHE_DIR=%t.cache compile-minimal-test %S/compile-minimal-test.cl %t_a.bin +// RUN: llvm-objdump -d %t_a.bin | FileCheck %S/compile-minimal-test.cl +// RUN: COUNT_BEFORE=$(ls "%t.cache" | wc -l) +// COM: One element for the tag, one for bc->obj another for obj->exec. No elements for src->bc since we currently not support it. +// RUN: [ 3 -eq $COUNT_BEFORE ] +// +// RUN: AMD_COMGR_CACHE_DIR=%t.cache compile-minimal-test %S/compile-minimal-test.cl %t_b.bin +// RUN: llvm-objdump -d %t_b.bin | FileCheck %S/compile-minimal-test.cl +// RUN: COUNT_AFTER=$(ls "%t.cache" | wc -l) +// RUN: [ $COUNT_AFTER = $COUNT_BEFORE ] +// diff --git a/amd/comgr/test-lit/compile-minimal-test.cl b/amd/comgr/test-lit/compile-minimal-test.cl new file mode 100644 index 0000000000000..d928560462dbe --- /dev/null +++ b/amd/comgr/test-lit/compile-minimal-test.cl @@ -0,0 +1,12 @@ +// COM: Run Comgr binary to compile OpenCL source into LLVM IR Bitcode, +// COM: And then generating an executable +// RUN: compile-minimal-test %s %t.bin + +// COM: Dissasemble +// RUN: llvm-objdump -d %t.bin | FileCheck %s +// CHECK: : +// CHECK: s_endpgm + +void kernel add(__global float *A, __global float *B, __global float *C) { + *C = *A + *B; +} diff --git a/amd/comgr/test-lit/lit.cfg.py b/amd/comgr/test-lit/lit.cfg.py index 15a0e9d507b9b..242c6448320a1 100644 --- a/amd/comgr/test-lit/lit.cfg.py +++ b/amd/comgr/test-lit/lit.cfg.py @@ -14,3 +14,7 @@ if not config.comgr_disable_spirv: config.available_features.add("comgr-has-spirv") + +# By default, disable the cache for the tests. +# Test for the cache must explicitly enable this variable. 
+config.environment['AMD_COMGR_CACHE'] = "0" diff --git a/amd/comgr/test-lit/spirv-to-reloc-debuginfo.hip b/amd/comgr/test-lit/spirv-to-reloc-debuginfo.hip new file mode 100644 index 0000000000000..f09a41eb2abd5 --- /dev/null +++ b/amd/comgr/test-lit/spirv-to-reloc-debuginfo.hip @@ -0,0 +1,53 @@ +// REQUIRES: comgr-has-spirv +// XFAIL: * + +// COM: Generate a debuginfo SPIR-V file from a HIP kernel +// RUN: clang -x hip --offload-arch=amdgcnspirv -nogpulib -nogpuinc \ +// RUN: --no-gpu-bundle-output --offload-device-only -O3 %s -o %t.dbg.spv -g + +// COM: Compile debuginfo SPIR-V source to a relocatable +// RUN: AMD_COMGR_EMIT_VERBOSE_LOGS=1 AMD_COMGR_REDIRECT_LOGS=stdout \ +// RUN: spirv-to-reloc %t.dbg.spv %t.dbg.o | FileCheck --dump-input-filter all \ +// RUN: -check-prefix=CHECK-DBG %s + +// COM: Check that debuginfo SPIR-V flags are correctly extracted +// CHECK-DBG: Driver Job Args: {{.*}} "-mllvm" "-amdgpu-spill-cfi-saved-regs" + +#include + +#define __constant__ __attribute__((constant)) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __host__ __attribute__((host)) +#define __shared__ __attribute__((shared)) +#define __managed__ __attribute__((managed)) +#define __launch_bounds__(...) 
__attribute__((launch_bounds(__VA_ARGS__))) + +struct dim3 { + unsigned x, y, z; + __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} +}; + +#ifdef __HIP__ +typedef struct hipStream *hipStream_t; +typedef enum hipError {} hipError_t; +int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, + size_t sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, + hipStream_t stream); +#endif + +__attribute__((device)) +void clean_value(float* ptr) { *ptr = 0; } + +__attribute__((global)) +void add_value(float* a, float* b, float* res) { + *res = *a + *b; + + clean_value(a); +} diff --git a/amd/comgr/test-lit/spirv-to-reloc.hip b/amd/comgr/test-lit/spirv-to-reloc.hip new file mode 100644 index 0000000000000..40bf8fa6e6aca --- /dev/null +++ b/amd/comgr/test-lit/spirv-to-reloc.hip @@ -0,0 +1,59 @@ +// REQUIRES: comgr-has-spirv + +// COM: Generate a SPIR-V file from a HIP kernel +// RUN: clang -x hip --offload-arch=amdgcnspirv -nogpulib -nogpuinc \ +// RUN: --no-gpu-bundle-output --offload-device-only -O3 %s -o %t.spv \ +// RUN: -fvisibility=hidden -fno-autolink -fexceptions -fcolor-diagnostics + +// COM: Compile SPIR-V source to a relocatable +// RUN: AMD_COMGR_EMIT_VERBOSE_LOGS=1 AMD_COMGR_REDIRECT_LOGS=spirv-flags.txt \ +// RUN: spirv-to-reloc %t.spv %t.o + +// COM: Check that SPIR-V flags are correctly extracted +// RUN: grep '\-fvisibility=hidden' spirv-flags.txt +// RUN: grep '\-fno-autolink' spirv-flags.txt +// RUN: grep '\-fexceptions' spirv-flags.txt +// RUN: grep '\-fcolor-diagnostics' spirv-flags.txt +// RUN: grep '\-O3' spirv-flags.txt +// RUN: grep '\-mcode-object-version=5' spirv-flags.txt + +// RUN: rm spirv-flags.txt + +#include + +#define __constant__ __attribute__((constant)) +#define 
__device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __host__ __attribute__((host)) +#define __shared__ __attribute__((shared)) +#define __managed__ __attribute__((managed)) +#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__))) + +struct dim3 { + unsigned x, y, z; + __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} +}; + +#ifdef __HIP__ +typedef struct hipStream *hipStream_t; +typedef enum hipError {} hipError_t; +int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, + size_t sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, + hipStream_t stream); +#endif + +__attribute__((device)) +void clean_value(float* ptr) { *ptr = 0; } + +__attribute__((global)) +void add_value(float* a, float* b, float* res) { + *res = *a + *b; + + clean_value(a); +} diff --git a/amd/comgr/test-lit/spirv-translator-cached.cl b/amd/comgr/test-lit/spirv-translator-cached.cl new file mode 100644 index 0000000000000..8ff28dacd6071 --- /dev/null +++ b/amd/comgr/test-lit/spirv-translator-cached.cl @@ -0,0 +1,28 @@ +// REQUIRES: comgr-has-spirv +// COM: Same as spirv-translator but with the cache +// RUN: rm -fr %t.cache + +// COM: Generate a spirv-targeted LLVM IR file from an OpenCL kernel +// RUN: clang -c -emit-llvm --target=spirv64 %S/spirv-translator.cl -o %t.bc + +// COM: Translate LLVM IR to SPIRV format +// RUN: amd-llvm-spirv --spirv-target-env=CL2.0 %t.bc -o %t.spv + +// COM: Run Comgr Translator to covert SPIRV back to LLVM IR +// RUN: export AMD_COMGR_CACHE=1 +// RUN: AMD_COMGR_CACHE_DIR=%t.cache spirv-translator %t.spv -o %t.translated.bc +// RUN: COUNT=$(ls "%t.cache" | wc -l) +// RUN: [ 2 -eq $COUNT ] + +// COM: Run again and check that the cache 
contents haven't changed +// RUN: AMD_COMGR_CACHE_DIR=%t.cache spirv-translator %t.spv -o %t.translated.again.bc +// RUN: COUNT=$(ls "%t.cache" | wc -l) +// RUN: [ 2 -eq $COUNT ] + +// COM: Run again and check that the cache contents haven't changed +// RUN: AMD_COMGR_CACHE_DIR=%t.cache spirv-translator %t.spv -o %t.translated.again.bc +// RUN: COUNT=$(ls "%t.cache" | wc -l) +// RUN: [ 2 -eq $COUNT ] + +// COM: Dissasemble LLVM IR bitcode to LLVM IR text +// RUN: llvm-dis %t.translated.bc -o - | FileCheck %S/spirv-translator.cl diff --git a/amd/comgr/test-lit/spirv-translator.hip b/amd/comgr/test-lit/spirv-translator.hip index 50c37f8cbd1ea..a29e87ed486e0 100644 --- a/amd/comgr/test-lit/spirv-translator.hip +++ b/amd/comgr/test-lit/spirv-translator.hip @@ -1,4 +1,3 @@ -// XFAIL: * // REQUIRES: comgr-has-spirv // COM: Generate a SPIRV file from a HIP kernel // RUN: clang -x hip --offload-arch=amdgcnspirv -nogpulib -nogpuinc \ diff --git a/amd/comgr/test-lit/vfs-tests/lit.local.cfg b/amd/comgr/test-lit/vfs-tests/lit.local.cfg new file mode 100644 index 0000000000000..78283bc64f747 --- /dev/null +++ b/amd/comgr/test-lit/vfs-tests/lit.local.cfg @@ -0,0 +1,2 @@ +config.environment['AMD_COMGR_EMIT_VERBOSE_LOGS'] = "1" +config.environment['AMD_COMGR_REDIRECT_LOGS'] = "stdout" diff --git a/amd/comgr/test-lit/vfs-tests/vfs-tests.cl b/amd/comgr/test-lit/vfs-tests/vfs-tests.cl new file mode 100644 index 0000000000000..bc7c66af85211 --- /dev/null +++ b/amd/comgr/test-lit/vfs-tests/vfs-tests.cl @@ -0,0 +1,72 @@ +// COM: Prefixes follow pattern (AMD_COMGR_SAVETEMPS)-(AMD_COMGR_USE_VFS)-(DataAction API) + +// COM: Default behavior right now is to use the real file system +// RUN: source-to-bc-with-dev-libs %s -o %t-with-dev-libs.bc | FileCheck --check-prefixes=STATUS,OUT-NA-NA-NA %s + +// COM: AMD_COMGR_USE_VFS=1 should force the compiler to use VFS, irrespective of the option provided via the DataAction API +// RUN: env AMD_COMGR_USE_VFS=1 source-to-bc-with-dev-libs %s --novfs 
-o %t-with-dev-libs.bc | FileCheck --check-prefixes=STATUS,OUT-NA-VFS-NOVFS %s +// RUN: env AMD_COMGR_USE_VFS=1 source-to-bc-with-dev-libs %s -o %t-with-dev-libs.bc | FileCheck --check-prefixes=STATUS,OUT-NA-VFS-NA %s + +// COM: AMD_COMGR_USE_VFS=0 should force the compiler to not use VFS, irrespective of the option provided via the DataAction API +// RUN: env AMD_COMGR_USE_VFS=0 source-to-bc-with-dev-libs %s --vfs -o %t-with-dev-libs.bc | FileCheck --check-prefixes=STATUS,OUT-NA-NOVFS-VFS %s +// RUN: env AMD_COMGR_USE_VFS=0 source-to-bc-with-dev-libs %s -o %t-with-dev-libs.bc | FileCheck --check-prefixes=STATUS,OUT-NA-NOVFS-NA %s + +// COM: No value for AMD_COMGR_USE_VFS should respect option provided via the DataAction API +// RUN: source-to-bc-with-dev-libs %s --vfs -o %t-with-dev-libs.bc | FileCheck --check-prefixes=STATUS,OUT-NA-NA-VFS %s +// RUN: source-to-bc-with-dev-libs %s --novfs -o %t-with-dev-libs.bc | FileCheck --check-prefixes=STATUS,OUT-NA-NA-NOVFS %s + +// COM: AMD_COMGR_SAVE_TEMPS=1 should override all options and always use the real file system +// RUN: env AMD_COMGR_SAVE_TEMPS=1 source-to-bc-with-dev-libs %s --vfs -o %t-with-dev-libs.bc | FileCheck --check-prefixes=STATUS,OUT-SAVETEMPS-NA-VFS %s +// RUN: env AMD_COMGR_SAVE_TEMPS=1 AMD_COMGR_USE_VFS=1 source-to-bc-with-dev-libs %s -o %t-with-dev-libs.bc | FileCheck --check-prefixes=STATUS,OUT-SAVETEMPS-VFS-NA %s +// RUN: env AMD_COMGR_SAVE_TEMPS=1 AMD_COMGR_USE_VFS=1 source-to-bc-with-dev-libs %s --vfs -o %t-with-dev-libs.bc | FileCheck --check-prefixes=STATUS,OUT-SAVETEMPS-VFS-VFS %s + +// COM: Verify success of compilation for all scenarios +// STATUS: ReturnStatus: AMD_COMGR_STATUS_SUCCESS + +// OUT-NA-NA-NA: File System: VFS +// OUT-NA-VFS-NOVFS: File System: VFS +// OUT-NA-VFS-NA: File System: VFS +// OUT-NA-NOVFS-VFS: File System: Real +// OUT-NA-NOVFS-NA: File System: Real +// OUT-NA-NA-VFS: File System: VFS +// OUT-NA-NA-NOVFS: File System: Real +// OUT-SAVETEMPS-NA-VFS: File System: Real 
+// OUT-SAVETEMPS-VFS-VFS: File System: Real +// OUT-SAVETEMPS-VFS-NA: File System: Real + +extern const __constant bool __oclc_finite_only_opt; +extern const __constant bool __oclc_unsafe_math_opt; +extern const __constant bool __oclc_correctly_rounded_sqrt32; +extern const __constant bool __oclc_wavefrontsize64; +extern const __constant int __oclc_ISA_version; +extern const __constant int __oclc_ABI_version; + +void kernel device_libs(__global float *status) { + + if (__oclc_finite_only_opt) status[0] = 1.0; + if (__oclc_unsafe_math_opt) status[1] = 1.0; + if (__oclc_correctly_rounded_sqrt32) status[3] = 1.0; + if (__oclc_wavefrontsize64) status[4] = 1.0; + if (__oclc_ISA_version) status[5] = 1.0; + if (__oclc_ABI_version) status[6] = 1.0; + + // Math functions to test AMDGPULibCalls Folding optimizations + // fold_sincos() + float x = 0.25; + status[7] = sin(x) + cos(x); + status[8] = cos(x) + sin(x); + + // fold_rootn() + float y = 725.0; + status[9] = rootn(y, 3); + status[10] = rootn(y, -1); + status[11] = rootn(y, -2); + + // fold_pow() + float z = 12.16; + status[12] = pow(z, (float) 0.5); + status[13] = powr(y, (float) 7.23); + + // printf() + printf("testy\n"); +} diff --git a/amd/comgr/test/CMakeLists.txt b/amd/comgr/test/CMakeLists.txt index b4a4c5ac26550..b13ca16aab98e 100644 --- a/amd/comgr/test/CMakeLists.txt +++ b/amd/comgr/test/CMakeLists.txt @@ -193,9 +193,12 @@ endif() add_dependencies(check-comgr ${name}) # Windows binaries have no equivalent to RPATH, so we must set their PATH to # include the .lib/.dll directory. 
- if (NOT(UNIX)) + if (UNIX) set_tests_properties(${test_name} - PROPERTIES ENVIRONMENT "PATH=$") + PROPERTIES ENVIRONMENT "AMD_COMGR_CACHE=0;") + else() + set_tests_properties(${test_name} + PROPERTIES ENVIRONMENT "PATH=$;AMD_COMGR_CACHE=0;") endif() endmacro() @@ -218,6 +221,7 @@ add_comgr_test(compile_minimal_test c) add_comgr_test(compile_log_test c) add_comgr_test(compile_log_remarks_test c) add_comgr_test(compile_source_with_device_libs_to_bc_test c) +add_comgr_test(compile_source_with_device_libs_to_bc_with_vfs_test c) add_comgr_test(assemble_test c) add_comgr_test(link_test c) add_comgr_test(isa_name_parsing_test c) diff --git a/amd/comgr/test/compile_source_with_device_libs_to_bc_with_vfs_test.c b/amd/comgr/test/compile_source_with_device_libs_to_bc_with_vfs_test.c new file mode 100644 index 0000000000000..fe2e3a88eb972 --- /dev/null +++ b/amd/comgr/test/compile_source_with_device_libs_to_bc_with_vfs_test.c @@ -0,0 +1,170 @@ +//===- compile_source_with_device_libs_to_bc_with_vfs_test.c --------------===// +// +// Part of Comgr, under the Apache License v2.0 with LLVM Exceptions. See +// amd/comgr/LICENSE.TXT in this repository for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "amd_comgr.h" +#include "common.h" +#include +#include +#include + +int main(int argc, char *argv[]) { + char *BufSource; + size_t SizeSource; + amd_comgr_data_t DataSource; + amd_comgr_data_set_t DataSetIn, DataSetPch, DataSetBc, DataSetLinked, + DataSetReloc, DataSetExec; + amd_comgr_action_info_t DataAction; + amd_comgr_status_t Status; + const char *CodeGenOptions[] = {"-mcode-object-version=5", "-mllvm", + "-amdgpu-prelink"}; + size_t CodeGenOptionsCount = + sizeof(CodeGenOptions) / sizeof(CodeGenOptions[0]); + + SizeSource = setBuf(TEST_OBJ_DIR "/device_libs.cl", &BufSource); + + Status = amd_comgr_create_data_set(&DataSetIn); + checkError(Status, "amd_comgr_create_data_set"); + + Status = amd_comgr_create_data(AMD_COMGR_DATA_KIND_SOURCE, &DataSource); + checkError(Status, "amd_comgr_create_data"); + Status = amd_comgr_set_data(DataSource, SizeSource, BufSource); + checkError(Status, "amd_comgr_set_data"); + Status = amd_comgr_set_data_name(DataSource, "device_libs.cl"); + checkError(Status, "amd_comgr_set_data_name"); + Status = amd_comgr_data_set_add(DataSetIn, DataSource); + checkError(Status, "amd_comgr_data_set_add"); + + Status = amd_comgr_create_action_info(&DataAction); + checkError(Status, "amd_comgr_create_action_info"); + Status = amd_comgr_action_info_set_language(DataAction, + AMD_COMGR_LANGUAGE_OPENCL_1_2); + checkError(Status, "amd_comgr_action_info_set_language"); + Status = amd_comgr_action_info_set_isa_name(DataAction, + "amdgcn-amd-amdhsa--gfx900"); + checkError(Status, "amd_comgr_action_info_set_isa_name"); + // Set VFS knob to true + Status = amd_comgr_action_info_set_vfs(DataAction, true); + checkError(Status, "amd_comgr_action_info_set_vfs"); + + Status = amd_comgr_create_data_set(&DataSetPch); + checkError(Status, "amd_comgr_create_data_set"); + + Status = 
amd_comgr_do_action(AMD_COMGR_ACTION_ADD_PRECOMPILED_HEADERS, + DataAction, DataSetIn, DataSetPch); + checkError(Status, "amd_comgr_do_action"); + + size_t Count; + Status = amd_comgr_action_data_count( + DataSetPch, AMD_COMGR_DATA_KIND_PRECOMPILED_HEADER, &Count); + checkError(Status, "amd_comgr_action_data_count"); + + if (Count != 1) { + printf("AMD_COMGR_ACTION_ADD_PRECOMPILED_HEADERS Failed: " + "produced %zu precompiled header objects (expected 1)\n", + Count); + exit(1); + } + + Status = amd_comgr_create_data_set(&DataSetBc); + checkError(Status, "amd_comgr_create_data_set"); + + Status = amd_comgr_action_info_set_option_list(DataAction, CodeGenOptions, + CodeGenOptionsCount); + checkError(Status, "amd_comgr_action_info_set_option_list"); + + Status = amd_comgr_do_action( + AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC, DataAction, + DataSetPch, DataSetBc); + checkError(Status, "amd_comgr_do_action"); + + Status = + amd_comgr_action_data_count(DataSetBc, AMD_COMGR_DATA_KIND_BC, &Count); + checkError(Status, "amd_comgr_action_data_count"); + + if (Count != 1) { + printf("AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC Failed: " + "produced %zu BC objects (expected 1)\n", + Count); + exit(1); + } + + Status = amd_comgr_create_data_set(&DataSetLinked); + checkError(Status, "amd_comgr_create_data_set"); + + Status = amd_comgr_do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, DataAction, + DataSetBc, DataSetLinked); + checkError(Status, "amd_comgr_do_action"); + + Status = amd_comgr_action_data_count(DataSetLinked, AMD_COMGR_DATA_KIND_BC, + &Count); + checkError(Status, "amd_comgr_action_data_count"); + + if (Count != 1) { + printf("AMD_COMGR_ACTION_LINK_BC_TO_BC Failed: " + "produced %zu BC objects (expected 1)\n", + Count); + exit(1); + } + + Status = amd_comgr_create_data_set(&DataSetReloc); + checkError(Status, "amd_comgr_create_data_set"); + + Status = amd_comgr_do_action(AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE, + DataAction, DataSetLinked, DataSetReloc); + 
checkError(Status, "amd_comgr_do_action"); + + Status = amd_comgr_action_data_count(DataSetReloc, + AMD_COMGR_DATA_KIND_RELOCATABLE, &Count); + checkError(Status, "amd_comgr_action_data_count"); + + if (Count != 1) { + printf("AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE Failed: " + "produced %zu relocatable objects (expected 1)\n", + Count); + exit(1); + } + + Status = amd_comgr_create_data_set(&DataSetExec); + checkError(Status, "amd_comgr_create_data_set"); + + Status = amd_comgr_action_info_set_option_list(DataAction, NULL, 0); + checkError(Status, "amd_comgr_action_info_set_option_list"); + + Status = amd_comgr_do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, + DataAction, DataSetReloc, DataSetExec); + checkError(Status, "amd_comgr_do_action"); + + Status = amd_comgr_action_data_count(DataSetExec, + AMD_COMGR_DATA_KIND_EXECUTABLE, &Count); + checkError(Status, "amd_comgr_action_data_count"); + + if (Count != 1) { + printf("AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE Failed: " + "produced %zu executable objects (expected 1)\n", + Count); + exit(1); + } + + Status = amd_comgr_release_data(DataSource); + checkError(Status, "amd_comgr_release_data"); + Status = amd_comgr_destroy_data_set(DataSetIn); + checkError(Status, "amd_comgr_destroy_data_set"); + Status = amd_comgr_destroy_data_set(DataSetPch); + checkError(Status, "amd_comgr_destroy_data_set"); + Status = amd_comgr_destroy_data_set(DataSetBc); + checkError(Status, "amd_comgr_destroy_data_set"); + Status = amd_comgr_destroy_data_set(DataSetLinked); + checkError(Status, "amd_comgr_destroy_data_set"); + Status = amd_comgr_destroy_data_set(DataSetReloc); + checkError(Status, "amd_comgr_destroy_data_set"); + Status = amd_comgr_destroy_data_set(DataSetExec); + checkError(Status, "amd_comgr_destroy_data_set"); + Status = amd_comgr_destroy_action_info(DataAction); + checkError(Status, "amd_comgr_destroy_action_info"); + free(BufSource); +} diff --git a/amd/comgr/test/unbundle_hip_test.c 
b/amd/comgr/test/unbundle_hip_test.c index 3c20bcec3dbf2..8a9a3997faec0 100644 --- a/amd/comgr/test/unbundle_hip_test.c +++ b/amd/comgr/test/unbundle_hip_test.c @@ -182,11 +182,10 @@ int main(int Argc, char *Argv[]) { Status = amd_comgr_release_data(DataElement); checkError(Status, "amd_comgr_release_data"); - // TODO: Re-enable after finalizing LLVM PR #122629 - //if (BytesSize != 0) { - // printf("Bitcode host element size: %ld (expected 0)\n", BytesSize); - // exit(1); - //} + if (!BytesSize) { + printf("Bitcode host empty (expected non-empty)\n"); + exit(1); + } // bitcode hip-gfx900 element (non-empty) Status = amd_comgr_action_data_get_data( @@ -248,11 +247,10 @@ int main(int Argc, char *Argv[]) { Status = amd_comgr_release_data(DataElement); checkError(Status, "amd_comgr_release_data"); - // TODO: Re-enable after finalizing LLVM PR #122629 - //if (BytesSize != 0) { - // printf("Object host element size: %ld (expected empty)\n", BytesSize); - // exit(1); - //} + if (BytesSize != 0) { + printf("Object host element size: %ld (expected empty)\n", BytesSize); + exit(1); + } // object hip-gfx900 element (non-empty) Status = amd_comgr_action_data_get_data( @@ -314,8 +312,8 @@ int main(int Argc, char *Argv[]) { Status = amd_comgr_release_data(DataElement); checkError(Status, "amd_comgr_release_data"); - if (BytesSize != 8) { - printf("Arvhive host element size: %ld (expected 8)\n", BytesSize); + if (!BytesSize) { + printf("Arvhive host empty (expected non-empty)\n"); exit(1); } diff --git a/amd/device-libs/asanrtl/src/dm.cl b/amd/device-libs/asanrtl/src/dm.cl index bb3e627665eaf..5cd6b1580d9ec 100644 --- a/amd/device-libs/asanrtl/src/dm.cl +++ b/amd/device-libs/asanrtl/src/dm.cl @@ -9,6 +9,9 @@ #include "asan_util.h" #include "shadow_mapping.h" +#define OPTNONE __attribute__((optnone)) + +static const __constant uchar kAsanHeapLeftRedzoneMagic = (uchar)0xfa; static const __constant uint kAsanHeapLeftRedzoneMagicx4 = 0xfafafafaU; static const __constant ulong 
kAsanHeapLeftRedzoneMagicx8 = 0xfafafafafafafafaUL; static const __constant uchar kAsanHeapFreeMagic = (uchar)0xfd; @@ -18,14 +21,20 @@ extern ulong __ockl_devmem_request(ulong addr, ulong size); // Whether we track non-slab allocations #define NON_SLAB_TRACKING 1 +// Whether we add ID to slabs +#define SLAB_IDENTITY 1 + // Magic at beginning of allocation #define ALLOC_MAGIC 0xfedcba1ee1abcdefUL #define AS(P,V) __opencl_atomic_store(P, V, memory_order_relaxed, memory_scope_device) #define AL(P) __opencl_atomic_load(P, memory_order_relaxed, memory_scope_device) #define AA(P,V) __opencl_atomic_fetch_add(P, V, memory_order_relaxed, memory_scope_device) +#define AN(P,V) __opencl_atomic_fetch_and(P, V, memory_order_relaxed, memory_scope_device) #define AO(P,V) __opencl_atomic_fetch_or(P, V, memory_order_relaxed, memory_scope_device) #define ACE(P,E,V) __opencl_atomic_compare_exchange_strong(P, E, V, memory_order_relaxed, memory_order_relaxed, memory_scope_device) +#define RF() __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent", "global") +#define ARF() __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent", "global") // An allocation #define ALLOC_HEADER_BYTES 32 @@ -41,7 +50,7 @@ typedef struct alloc_struct { // Assumes 4096 byte minimum alignment of slab #define SLAB_ALIGN 4096 #define SLAB_BUSY ((__global slab_t *)1UL) -#define SLAB_TICKS 20000 +#define SLAB_TICKS 100000 #define SLAB_BYTES (1UL << 21) #define SLAB_THRESHOLD (SLAB_BYTES / 64) #define SLAB_HEADER_BYTES 32 @@ -53,16 +62,18 @@ typedef struct alloc_struct { #define LINE 128 #define PAD(N,M) ulong pad##N[LINE/8 - M]; -#define F_POISON_START 0x01 -#define F_POISON_DONE 0x02 +#define F_POISON_NEEDED 0x01 +#define F_POISON_PENDING 0x02 +#define F_UNREADY 0x04 +#define F_MASK (F_POISON_NEEDED | F_POISON_PENDING | F_UNREADY) // A slab of memory used to provide malloc returned blocks typedef struct slab_s { atomic_ulong next; // link to next slab on queue chain, must be first - atomic_ulong ap; // Pointer to next 
allocation (>= &space[0] ) + atomic_ulong ap; // Pointer to next allocation and flags atomic_uint rb; // returned bytes - atomic_uint flags; // flags - ulong pad; + uint pad; + atomic_ulong sid; // slab ID ulong space[(SLAB_BYTES-SLAB_HEADER_BYTES)/8]; // Space for allocations. Must be aligned 16 } slab_t; @@ -93,16 +104,14 @@ typedef struct heap_s { #if defined NON_SLAB_TRACKING atomic_ulong num_nonslab_allocations; // Count of number of non-slab allocations that have not been freed PAD(5,1); +#endif +#if defined SLAB_IDENTITY + atomic_ulong num_slab_allocations; // Count of total slabs allocated + PAD(6,1); #endif lifo_t la[NLA]; // Storage for available slabs } heap_t; -// Inhibit control flow optimizations -#define O0(X) X = o0(X) -__attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; } -__attribute__((overloadable)) static uint o0(uint x) { uint y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; } -__attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; } - // Overloads to broadcast the value held by the first active lane // The result is known to be wave-uniform static __attribute__((overloadable)) uint @@ -181,18 +190,19 @@ added_redzone(uint sz) static void slab_pause(void) { - __builtin_amdgcn_s_sleep(3); + __builtin_amdgcn_s_sleep(9); } + // Intended to be called from only one lane of a wave -__attribute__((optnone)) +OPTNONE NO_SANITIZE_ADDR static void put_free_slab(__global heap_t *hp, __global slab_t *sp) { __global lifo_t *lp = LP(hp, AA(&hp->wid, 1UL)); - for (ulong i=1;;++i) { + for (;;) { ulong top = AL(&lp->top); AS(&sp->next, (ulong)slabptr(top)); if (ACE(&lp->top, &top, addcnt((ulong)sp, top))) { @@ -203,7 +213,6 @@ put_free_slab(__global heap_t *hp, __global slab_t *sp) } // Intended to be called from only one lane of a wave -__attribute__((optnone)) NO_SANITIZE_ADDR static __global slab_t * get_free_slab(__global heap_t 
*hp) @@ -213,47 +222,41 @@ get_free_slab(__global heap_t *hp) __global lifo_t *lp = LP(hp, AA(&hp->rid, 1UL)); - for (ulong i=1;;++i) { + for (;;) { ulong top = AL(&lp->top); __global slab_t *sp = slabptr(top); if (sp) { ulong next = AL(&sp->next); - if (ACE(&lp->top, &top, addcnt(next, top))) { + if (ACE(&lp->top, &top, addcnt(next, top))) return sp; - } } else { return 0; } slab_pause(); } -} -// reset slab, called by a single workitem -NO_SANITIZE_ADDR -static void -reset_slab(__global slab_t *sp) -{ - AS(&sp->ap, (ulong)sp + SLAB_HEADER_BYTES); - AS(&sp->rb, 0U); } NO_SANITIZE_ADDR static void -poison_allocation(__global alloc_t *ap, uint sz) +ready_slab(__global slab_t *sp) { - __global uchar *asp = (__global uchar *)MEM_TO_SHADOW((ulong)ap) + ALLOC_HEADER_BYTES / SHADOW_GRANULARITY; - for (uint i = 0; i < (sz + SHADOW_GRANULARITY - 1) / SHADOW_GRANULARITY; ++i) - asp[i] = kAsanHeapFreeMagic; - - __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); + AS(&sp->rb, 0U); + if (!(AL(&sp->ap) & (ulong)(F_POISON_PENDING | F_POISON_NEEDED))) { + AS(&sp->ap, (ulong)sp + SLAB_HEADER_BYTES); + } else { + AN(&sp->ap, ~(ulong)F_UNREADY); + } } NO_SANITIZE_ADDR static void unpublish_allocation(__global alloc_t *ap, ulong pc) { + uint arz = ap->asz - ALLOC_HEADER_BYTES - round_16(ap->usz); + __global uchar *s = (__global uchar *)MEM_TO_SHADOW((ulong)ap - arz); + __builtin_memset(s, kAsanHeapFreeMagic, ap->asz / SHADOW_GRANULARITY); ap->pc = pc; - poison_allocation(ap, ap->usz); } // Free a slab based allocation @@ -264,9 +267,9 @@ slab_free(__global alloc_t *ap, ulong pc) unpublish_allocation(ap, pc); __global heap_t *hp = get_heap_ptr(); __global slab_t *sp = (__global slab_t *)ap->sp; + int go = 1; do { - O0(go); if (go) { if (sp == first(sp)) { uint sz = __ockl_alisa_u32(ap->asz); @@ -274,10 +277,6 @@ slab_free(__global alloc_t *ap, ulong pc) if (aid == 0) { uint rb = AA(&sp->rb, sz) + sz; if (rb == SLAB_BYTES - SLAB_HEADER_BYTES) { - ulong cs = AL(&hp->cs); - if ((ulong)sp 
== cs) { - ACE(&hp->cs, &cs, 0UL); - } put_free_slab(hp, sp); } } @@ -318,6 +317,8 @@ __asan_free_impl(ulong aa, ulong pc) pc -= CALL_BYTES; + ARF(); + uptr sa = MEM_TO_SHADOW(aa); s8 sb = *(__global s8*) sa; if (sb != 0 && ((s8)(aa & (SHADOW_GRANULARITY-1)) >= sb)) { @@ -329,6 +330,8 @@ __asan_free_impl(ulong aa, ulong pc) slab_free(ap, pc); else non_slab_free(ap, pc); + + ARF(); } // Non-slab based allocation (when size is above threshold) @@ -400,9 +403,13 @@ try_new_slab(__global heap_t *hp) __global slab_t *sp = obtain_new_slab(hp); if (sp) { AS(&sp->next, 0UL); - AS(&sp->ap, (ulong)sp->space); AS(&sp->rb, 0U); - AS(&sp->flags, 0U); + AS(&sp->ap, (ulong)sp + (ulong)SLAB_HEADER_BYTES + (ulong)(F_UNREADY | F_POISON_PENDING | F_POISON_NEEDED)); +#if defined SLAB_IDENTITY + AS(&sp->sid, AA(&hp->num_slab_allocations, 1UL)); +#else + AS(&sp->sid, 0UL); +#endif } return sp; } @@ -420,12 +427,12 @@ new_slab_wait(__global heap_t *hp) } // Called by a single workitem -__attribute__((optnone)) +OPTNONE NO_SANITIZE_ADDR static __global slab_t * get_current_slab(__global heap_t *hp) { - for (ulong i=1;;++i) { + for (;;) { ulong cs = AL(&hp->cs); if (cs) return (__global slab_t *)cs; @@ -444,19 +451,22 @@ get_current_slab(__global heap_t *hp) __global slab_t *fs = get_free_slab(hp); if (fs) { - reset_slab(fs); - if (ACE(&hp->cs, &cs, (ulong)fs)) + if (ACE(&hp->cs, &cs, (ulong)fs)) { + ready_slab(fs); return fs; + } put_free_slab(hp, fs); - return (__global slab_t *)cs; + continue; } __global slab_t *ns = try_new_slab(hp); if ((ulong)ns > (ulong)SLAB_BUSY) { - if (ACE(&hp->cs, &cs, (ulong)ns)) + if (ACE(&hp->cs, &cs, (ulong)ns)) { + AN(&ns->ap, ~(ulong)F_UNREADY); return ns; + } put_free_slab(hp, ns); - return (__global slab_t *)cs; + continue; } if (!ns) @@ -474,46 +484,33 @@ poison_slab(__global slab_t *sp, int aid, int na) for (int i=aid; i < SLAB_BYTES / SHADOW_GRANULARITY / sizeof(ulong); i += na) ssp[i] = kAsanHeapLeftRedzoneMagicx8; - - 
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); + RF(); if (!aid) - AO(&sp->flags, F_POISON_DONE); + AN(&sp->ap, ~(ulong)F_POISON_PENDING); } NO_SANITIZE_ADDR -static void -poison_slab_wait(__global slab_t *sp) -{ - while ((AL(&sp->flags) & F_POISON_DONE) == 0U) - slab_pause(); -} - -NO_SANITIZE_ADDR -static void -unpoison_allocation(__global alloc_t *ap, uint sz) +static ulong +publish_allocation(ulong ap, ulong sp, ulong pc, uint asz, uint arz, uint usz) { - __global uchar *asp = (__global uchar *)MEM_TO_SHADOW((ulong)ap) + ALLOC_HEADER_BYTES / SHADOW_GRANULARITY; - for (uint i = 0; i < sz / SHADOW_GRANULARITY; ++i) - asp[i] = (uchar)0; + __global uchar *s = (__global uchar *)MEM_TO_SHADOW(ap); - if (sz % SHADOW_GRANULARITY) - asp[sz / SHADOW_GRANULARITY] = (uchar)(sz % SHADOW_GRANULARITY); + __builtin_memset(s, kAsanHeapLeftRedzoneMagic, (arz + ALLOC_HEADER_BYTES) / SHADOW_GRANULARITY); - __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); -} + s += (arz + ALLOC_HEADER_BYTES) / SHADOW_GRANULARITY; + __builtin_memset(s, 0, usz / SHADOW_GRANULARITY); + if (usz % SHADOW_GRANULARITY) + s[usz / SHADOW_GRANULARITY] = (uchar)(usz % SHADOW_GRANULARITY); -NO_SANITIZE_ADDR -static void -publish_allocation(__global alloc_t *ap, ulong sp, ulong pc, uint asz, uint usz) -{ - ap->magic = ALLOC_MAGIC; - ap->pc = pc; - ap->sp = sp; - ap->asz = asz; - ap->usz = usz; + __global alloc_t *a = (__global alloc_t *)(ap + arz); + a->magic = ALLOC_MAGIC; + a->sp = sp; + a->pc = pc; + a->asz = asz; + a->usz = usz; - unpoison_allocation(ap, usz); + return ap + arz + ALLOC_HEADER_BYTES; } // slab based malloc @@ -530,7 +527,6 @@ slab_malloc(ulong lsz, ulong pc) int go = 1; do { if (go) { - O0(go); uint aid = __ockl_activelane_u32(); __global slab_t *cs = (__global slab_t *)0; @@ -543,46 +539,54 @@ slab_malloc(ulong lsz, ulong pc) continue; } - uint f = 0U; - if (!aid) { - f = AO(&cs->flags, F_POISON_START); - } - f = first(f); - if ((f & F_POISON_START) == 0) { - poison_slab(cs, aid, 
active_lane_count()); - } else if ((f & F_POISON_DONE) == 0) { - if (!aid) - poison_slab_wait(cs); - } + ulong o = (ulong)__ockl_alisa_u32(asz); - uint o = __ockl_alisa_u32(asz); - - ulong p = 0UL; + ulong ap = 0; if (!aid) - p = AA(&cs->ap, o); - p = first(p); + ap = AL(&cs->ap); + ap = first(ap); - if (p + o <= (ulong)cs + SLAB_BYTES) { - __global alloc_t *ap = (__global alloc_t *)(p + o - asz + arz); - publish_allocation(ap, (ulong)cs, pc, asz, usz); - ret = (ulong)ap + ALLOC_HEADER_BYTES; - go = 0; - } else { - if (!__ockl_activelane_u32()) { - ulong e = (ulong)cs; - ACE(&hp->cs, &e, 0UL); - } - if (p + o - asz < (ulong)cs + SLAB_BYTES) { - uint unused = (uint)((ulong)cs + SLAB_BYTES - (p + o - asz)); - uint rb = AA(&cs->rb, unused) + unused; + if (ap & (ulong)F_MASK) { + ulong p = 0; + if (!aid) + p = AN(&cs->ap, ~(ulong)F_POISON_NEEDED); + p = first(p); - if (rb == SLAB_BYTES - SLAB_HEADER_BYTES) - put_free_slab(hp, cs); - } + if (p & (ulong)F_POISON_NEEDED) + poison_slab(cs, aid, active_lane_count()); + else + slab_pause(); + } else { + ulong p = 0; + if (!aid) + p = AA(&cs->ap, o); + p = first(p); + + if (!(p & (ulong)F_MASK)) { + if (p + o <= (ulong)cs + SLAB_BYTES) { + ret = publish_allocation(p + o - asz, (ulong)cs, pc, asz, arz, usz); + go = 0; + } else { + if (!__ockl_activelane_u32()) { + ulong e = (ulong)cs; + ACE(&hp->cs, &e, 0UL); + AO(&cs->ap, (ulong)F_UNREADY); + } + if (p + o - asz < (ulong)cs + SLAB_BYTES) { + uint unused = (uint)((ulong)cs + SLAB_BYTES - (p + o - asz)); + uint rb = AA(&cs->rb, unused) + unused; + if (rb == SLAB_BYTES - SLAB_HEADER_BYTES) { + put_free_slab(hp, cs); + } + } + } + } else + slab_pause(); } } } while (__ockl_wfany_i32(go)); + return ret; } @@ -595,10 +599,17 @@ __asan_malloc_impl(ulong sz, ulong pc) { pc -= CALL_BYTES; + ARF(); + + ulong ret; if (sz > SLAB_THRESHOLD) - return non_slab_malloc(sz, pc); + ret = non_slab_malloc(sz, pc); else - return slab_malloc(sz, pc); + ret = slab_malloc(sz, pc); + + ARF(); + + 
return ret; } // This initialization assumes a one-workgroup grid with 256 work items, @@ -630,6 +641,9 @@ __ockl_dm_init_v1(ulong ha, ulong sa, uint hb, uint nis) hp->initial_slabs_end = sa + ((ulong)nis << 21); #if defined NON_SLAB_TRACKING AS(&hp->num_nonslab_allocations, 0UL); +#endif +#if defined SLAB_IDENTITY + AS(&hp->num_slab_allocations, 0UL); #endif } diff --git a/amd/device-libs/ockl/src/dm.cl b/amd/device-libs/ockl/src/dm.cl index 18efc54203b7e..26a27f540f186 100644 --- a/amd/device-libs/ockl/src/dm.cl +++ b/amd/device-libs/ockl/src/dm.cl @@ -181,6 +181,8 @@ typedef struct heap_s { #define AFO(P, V, O) __opencl_atomic_fetch_or (P, V, O, memory_scope_device) #define ACE(P, E, V, O) __opencl_atomic_compare_exchange_strong(P, E, V, O, O, memory_scope_device) +#define NEED_RELEASE __oclc_ISA_version >= 9400 && __oclc_ISA_version < 10000 + // get the heap pointer static __global heap_t * get_heap_ptr(void) { @@ -385,6 +387,10 @@ __ockl_dm_dealloc(ulong addr) return; } + if (NEED_RELEASE) { + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent", "global"); + } + // Find a slab block ulong saddr = addr & ~(ulong)0x1fffffUL; __global slab_t *sptr = (__global slab_t *)saddr; @@ -944,6 +950,9 @@ __ockl_dm_init_v1(ulong hp, ulong sp, uint hb, uint nis) } } + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent", "global"); + __builtin_amdgcn_s_barrier(); + if (lid == 0) { __global heap_t *thp = (__global heap_t *)hp; AS(&thp->initial_slabs, sp, memory_order_relaxed); diff --git a/amd/device-libs/ocml/src/tgammaF.cl b/amd/device-libs/ocml/src/tgammaF.cl index 6aa2a377e9321..e0f02b0b7bfda 100644 --- a/amd/device-libs/ocml/src/tgammaF.cl +++ b/amd/device-libs/ocml/src/tgammaF.cl @@ -19,7 +19,7 @@ MATH_MANGLE(tgamma)(float x) if (ax > 0x1.0p-6f) { // For x < 3, push to larger value using gamma(x) = gamma(x+1) / x float d = 1.0f; - if (x < 1.0f) { + if (ax < 1.0f) { d = MATH_MAD((ax + 3.0f), ax, 2.0f) * ax; ax = ax + 3.0f; } else if (ax < 2.0f) { @@ -42,9 +42,16 @@ 
MATH_MANGLE(tgamma)(float x) ret = x > 0x1.18521ep+5f ? PINF_F32 : ret; } else { float s = MATH_MANGLE(sinpi)(x); - float p = s*x*t2*t1*t1; - ret = MATH_DIV(-sqrtpiby2*d, MATH_MAD(p, pt, p)); - ret = x < -42.0f ? 0.0f : ret; + if (x > -30.0f) { + float p = s*x*t2*t1*t1; + ret = MATH_DIV(-sqrtpiby2*d, MATH_MAD(p, pt, p)); + } else if (x > -41.0f) { + float t3 = t2*t1; + float p1 = MATH_MAD(t3, pt, t3); + float p2 = s*x*t1; + ret = MATH_DIV(MATH_DIV(-sqrtpiby2*d, p1), p2); + } else + ret = 0.0f; ret = BUILTIN_FRACTION_F32(x) == 0.0f ? QNAN_F32 : ret; } } else { diff --git a/amd/device-libs/opencl/src/devenq/schedule_rocm.cl b/amd/device-libs/opencl/src/devenq/schedule_rocm.cl index 731da8853e72c..209eebeebc02f 100644 --- a/amd/device-libs/opencl/src/devenq/schedule_rocm.cl +++ b/amd/device-libs/opencl/src/devenq/schedule_rocm.cl @@ -61,16 +61,23 @@ min_command(uint slot_num, __global AmdAqlWrap* wraps) return minCommand; } +static inline bool +check_pcie_support(__global SchedulerParam* param) { + #define kInvalidWriteIndex (ulong)(-1) + return (param->write_index == kInvalidWriteIndex) ? true : false; +} + static inline void EnqueueDispatch(__global hsa_kernel_dispatch_packet_t* aqlPkt, __global SchedulerParam* param) { __global hsa_queue_t* child_queue = param->child_queue; - - // ulong index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed); - // The original code seen above relies on PCIe 3 atomics, which might not be supported on some systems, so use a device side global - // for workaround. 
- ulong index = atomic_fetch_add_explicit((__global atomic_ulong*)&param->write_index, (ulong)1, memory_order_relaxed, memory_scope_device); + ulong index; + if (check_pcie_support(param)) { + index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed); + } else { + index = atomic_fetch_add_explicit((__global atomic_ulong*)&param->write_index, (ulong)1, memory_order_relaxed, memory_scope_device); + } const ulong queueMask = child_queue->size - 1; __global hsa_kernel_dispatch_packet_t* dispatch_packet = &(((__global hsa_kernel_dispatch_packet_t*)(child_queue->base_address))[index & queueMask]); @@ -82,17 +89,20 @@ EnqueueScheduler(__global SchedulerParam* param) { __global hsa_queue_t* child_queue = param->child_queue; - // ulong index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed); - // The original code seen above relies on PCIe 3 atomics, which might not be supported on some systems, so use a device side global - // for workaround. - ulong index = atomic_fetch_add_explicit((__global atomic_ulong*)&param->write_index, (ulong)1, memory_order_relaxed, memory_scope_device); + ulong index; + if (check_pcie_support(param)) { + index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed); + } else { + index = atomic_fetch_add_explicit((__global atomic_ulong*)&param->write_index, (ulong)1, memory_order_relaxed, memory_scope_device); + } const ulong queueMask = child_queue->size - 1; __global hsa_kernel_dispatch_packet_t* dispatch_packet = &(((__global hsa_kernel_dispatch_packet_t*)(child_queue->base_address))[index & queueMask]); *dispatch_packet = param->scheduler_aql; - // This is part of the PCIe 3 atomics workaround, to write the final write_index value back to the child_queue - __ockl_hsa_queue_store_write_index(child_queue, index + 1, __ockl_memory_order_relaxed); + if (!check_pcie_support(param)) { + __ockl_hsa_queue_store_write_index(child_queue, index + 1, __ockl_memory_order_relaxed); + } 
__ockl_hsa_signal_store(child_queue->doorbell_signal, index, __ockl_memory_order_release); } diff --git a/amd/hipcc/CMakeLists.txt b/amd/hipcc/CMakeLists.txt index 53f472c66e451..8324372bb3e2e 100755 --- a/amd/hipcc/CMakeLists.txt +++ b/amd/hipcc/CMakeLists.txt @@ -69,7 +69,6 @@ set(CPACK_DEBIAN_ENABLE_COMPONENT_DEPENDS ON) set(CPACK_DEB_COMPONENT_INSTALL ON) set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core") -set(CPACK_DEBIAN_PACKAGE_RECOMMENDS "perl (>= 5.0)") set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/ROCm-Developer-Tools/HIPCC") if(DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE}) set(CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE}) @@ -80,19 +79,16 @@ endif() # AMD specific Debian packaging varibles. set(CPACK_DEBIAN_AMD_PACKAGE_NAME "hipcc") set(CPACK_DEBIAN_AMD_PACKAGE_DEPENDS "rocm-core, rocm-llvm") -set(CPACK_DEBIAN_AMD_PACKAGE_RECOMMENDS "perl (>= 5.0)") # NVIDIA specific Debian packaging variables. set(CPACK_DEBIAN_NVIDIA_PACKAGE_NAME "hipcc-nvidia") set(CPACK_DEBIAN_NVIDIA_PACKAGE_DEPENDS "rocm-core") # for NVIDIA we don't need to add rocm-llvm as a dependency -set(CPACK_DEBIAN_NVIDIA_PACKAGE_RECOMMENDS "perl (>= 5.0)") # RPM specific packaging variables. set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") set(CPACK_RPM_PACKAGE_LICENSE "MIT") set(CPACK_RPM_PACKAGE_REQUIRES "rocm-core") -set(CPACK_RPM_PACKAGE_SUGGESTS "perl >= 5.0") set(CPACK_RPM_PACKAGE_AUTOREQPROV 0) if(DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE}) set(CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE}) @@ -106,12 +102,10 @@ endif() # AMD specific RPM packaging varibables. set(CPACK_RPM_AMD_PACKAGE_NAME "hipcc") set(CPACK_RPM_AMD_PACKAGE_REQUIRES "rocm-core, rocm-llvm") -set(CPACK_RPM_AMD_PACKAGE_SUGGESTS "perl >= 5.0") # NVIDIA specific RPM packaging variables. 
set(CPACK_RPM_NVIDIA_PACKAGE_NAME "hipcc-nvidia") set(CPACK_RPM_NVIDIA_PACKAGE_REQUIRES "rocm-core") # for NVIDIA we don't need to add rocm-llvm as a dependency -set(CPACK_RPM_NVIDIA_PACKAGE_SUGGESTS "perl >= 5.0") # ROCM versioning. set(ROCM_VERSION_FOR_PACKAGE "") diff --git a/amd/hipcc/bin/hipcc.pl b/amd/hipcc/bin/hipcc.pl deleted file mode 100755 index b9d107f7f0c04..0000000000000 --- a/amd/hipcc/bin/hipcc.pl +++ /dev/null @@ -1,625 +0,0 @@ -#!/usr/bin/env perl -# Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -# Need perl > 5.10 to use logic-defined or -use 5.006; use v5.10.1; -use warnings; -use File::Basename; -use File::Temp qw/ :mktemp /; -use Cwd; -use Cwd 'abs_path'; - -# HIP compiler driver -# Will call clang or nvcc (depending on target) and pass the appropriate include and library options for -# the target compiler and HIP infrastructure. 
- -# Will pass-through options to the target compiler. The tools calling HIPCC must ensure the compiler -# options are appropriate for the target compiler. - -# Environment variable HIP_PLATFORM is to detect amd/nvidia path: -# HIP_PLATFORM='nvidia' or HIP_PLATFORM='amd'. -# If HIP_PLATFORM is not set hipcc will attempt auto-detect based on if nvcc is found. -# -# Other environment variable controls: -# HIP_PATH : Path to HIP directory, default is one dir level above location of this script. -# CUDA_PATH : Path to CUDA SDK (default /usr/local/cuda). Used on NVIDIA platforms only. -# HIP_ROCCLR_HOME : Path to HIP/ROCclr directory. Used on AMD platforms only. -# HIP_CLANG_PATH : Path to HIP-Clang (default to ../../llvm/bin relative to this -# script's abs_path). Used on AMD platforms only. - -if(scalar @ARGV == 0){ - print "No Arguments passed, exiting ...\n"; - exit(-1); -} - -# retrieve --rocm-path hipcc option from command line. -# We need to respect this over the env var ROCM_PATH for this compilation. 
-sub get_path_options { - my $rocm_path=""; - my $hip_path=""; - my @CLArgs = @ARGV; - foreach $arg (@CLArgs) { - if (index($arg,"--rocm-path=") != -1) { - ($rocm_path) = $arg=~ /=\s*(.*)\s*$/; - next; - } - if (index($arg,"--hip-path=") != -1) { - ($hip_path) = $arg=~ /=\s*(.*)\s*$/; - next; - } - } - return ($rocm_path, $hip_path); -} - -$verbose = $ENV{'HIPCC_VERBOSE'} // 0; -# Verbose: 0x1=commands, 0x2=paths, 0x4=hipcc args - -$HIPCC_COMPILE_FLAGS_APPEND=$ENV{'HIPCC_COMPILE_FLAGS_APPEND'}; -$HIPCC_LINK_FLAGS_APPEND=$ENV{'HIPCC_LINK_FLAGS_APPEND'}; - -# Known Features -@knownFeatures = ('sramecc-', 'sramecc+', 'xnack-', 'xnack+'); - -$HIP_LIB_PATH=$ENV{'HIP_LIB_PATH'}; -$DEVICE_LIB_PATH=$ENV{'DEVICE_LIB_PATH'}; -$HIP_CLANG_HCC_COMPAT_MODE=$ENV{'HIP_CLANG_HCC_COMPAT_MODE'}; # HCC compatibility mode -$HIP_COMPILE_CXX_AS_HIP=$ENV{'HIP_COMPILE_CXX_AS_HIP'} // "1"; - -my $base_dir; -BEGIN { - $base_dir = dirname(Cwd::realpath(__FILE__) ); - my ($rocm_path, $hip_path) = get_path_options(); - if ($rocm_path ne '') { - # --rocm-path takes precedence over ENV{ROCM_PATH} - $ENV{ROCM_PATH}=$rocm_path; - } - if ($hip_path ne '') { - # --rocm-path takes precedence over ENV{ROCM_PATH} - $ENV{HIP_PATH}=$hip_path; - } -} -use lib "$base_dir/"; - -use hipvars; -$isWindows = $hipvars::isWindows; -$doubleQuote = $hipvars::doubleQuote; -$HIP_RUNTIME = $hipvars::HIP_RUNTIME; -$HIP_PLATFORM = $hipvars::HIP_PLATFORM; -$HIP_COMPILER = $hipvars::HIP_COMPILER; -$HIP_CLANG_PATH = $hipvars::HIP_CLANG_PATH; -$CUDA_PATH = $hipvars::CUDA_PATH; -$HIP_PATH = $hipvars::HIP_PATH; -$ROCM_PATH = $hipvars::ROCM_PATH; -$HIP_VERSION = $hipvars::HIP_VERSION; -$HIP_ROCCLR_HOME = $hipvars::HIP_ROCCLR_HOME; - -sub get_normalized_path { - return $doubleQuote . $_[0] . 
$doubleQuote; -} - -if ($HIP_PLATFORM eq "amd") { - $HIP_INCLUDE_PATH = "$HIP_ROCCLR_HOME/include"; - if (!defined $HIP_LIB_PATH) { - $HIP_LIB_PATH = "$HIP_ROCCLR_HOME/lib"; - } -} - -if ($verbose & 0x2) { - print ("HIP_PATH=$HIP_PATH\n"); - print ("HIP_PLATFORM=$HIP_PLATFORM\n"); - print ("HIP_COMPILER=$HIP_COMPILER\n"); - print ("HIP_RUNTIME=$HIP_RUNTIME\n"); -} - -# set if user explicitly requests -stdlib=libc++. (else we default to libstdc++ for better interop with g++): -$setStdLib = 0; # TODO - set to 0 - -$default_amdgpu_target = 1; - -if ($HIP_PLATFORM eq "amd") { - $execExtension = ""; - if($isWindows) { - $execExtension = ".exe"; - } - $HIPCC=get_normalized_path("$HIP_CLANG_PATH/clang++" . $execExtension); - - # If $HIPCC clang++ is not compiled, use clang instead - if ( ! -e $HIPCC ) { - $HIPCC=get_normalized_path("$HIP_CLANG_PATH/clang" . $execExtension); - $HIPLDFLAGS = "--driver-mode=g++"; - } - # to avoid using dk linker or MSVC linker - if($isWindows) { - $HIPLDFLAGS .= " -fuse-ld=lld"; - $HIPLDFLAGS .= " --ld-path=" . get_normalized_path("$HIP_CLANG_PATH/lld-link.exe"); - } - - # get Clang RT Builtin path - $HIP_CLANG_RT_LIB = `$HIPCC --print-runtime-dir`; - chomp($HIP_CLANG_RT_LIB); - - if (! defined $HIP_INCLUDE_PATH) { - $HIP_INCLUDE_PATH = "$HIP_PATH/include"; - } - if (! defined $HIP_LIB_PATH) { - $HIP_LIB_PATH = "$HIP_PATH/lib"; - } - if ($verbose & 0x2) { - print ("ROCM_PATH=$ROCM_PATH\n"); - if (defined $HIP_ROCCLR_HOME) { - print ("HIP_ROCCLR_HOME=$HIP_ROCCLR_HOME\n"); - } - print ("HIP_CLANG_PATH=$HIP_CLANG_PATH\n"); - print ("HIP_INCLUDE_PATH=$HIP_INCLUDE_PATH\n"); - print ("HIP_LIB_PATH=$HIP_LIB_PATH\n"); - print ("DEVICE_LIB_PATH=$DEVICE_LIB_PATH\n"); - print ("HIP_CLANG_RT_LIB=$HIP_CLANG_RT_LIB\n"); - } - - if ($HIP_CLANG_HCC_COMPAT_MODE) { - ## Allow __fp16 as function parameter and return type. 
- $HIPCXXFLAGS .= " -Xclang -fallow-half-arguments-and-returns -D__HIP_HCC_COMPAT_MODE__=1"; - } -} elsif ($HIP_PLATFORM eq "nvidia") { - $CUDA_PATH=$ENV{'CUDA_PATH'} // '/usr/local/cuda'; - if ($verbose & 0x2) { - print ("CUDA_PATH=$CUDA_PATH\n"); - } - - $HIPCC=get_normalized_path("$CUDA_PATH/bin/nvcc"); - $HIPCXXFLAGS .= " -Wno-deprecated-gpu-targets "; - - $HIPLDFLAGS = " -Wno-deprecated-gpu-targets -lcuda -lcudart -L" . get_normalized_path("$CUDA_PATH/lib64"); -} else { - printf ("error: unknown HIP_PLATFORM = '$HIP_PLATFORM'"); - printf (" or HIP_COMPILER = '$HIP_COMPILER'"); - exit (-1); -} - -my $compileOnly = 0; -my $needCXXFLAGS = 0; # need to add CXX flags to compile step -my $needCFLAGS = 0; # need to add C flags to compile step -my $needLDFLAGS = 1; # need to add LDFLAGS to compile step. -my $fileTypeFlag = 0; # to see if -x flag is mentioned -my $hasOMPTargets = 0; # If OMP targets is mentioned -my $hasC = 0; # options contain a c-style file -my $hasCXX = 0; # options contain a cpp-style file (NVCC must force recognition as GPU file) -my $hasHIP = 0; # options contain a hip-style file (HIP-Clang must pass offloading options) -my $printHipVersion = 0; # print HIP version -my $printCXXFlags = 0; # print HIPCXXFLAGS -my $printLDFlags = 0; # print HIPLDFLAGS -my $runCmd = 1; -my $buildDeps = 0; -my $hsacoVersion = 0; -my $funcSupp = 1; # enable function support -my $rdc = 0; # whether -fgpu-rdc is on - -my @options = (); -my @inputs = (); - -if ($verbose & 0x4) { - print "hipcc-args: ", join (" ", @ARGV), "\n"; -} - -# Handle code object generation -my $ISACMD=""; -if($HIP_PLATFORM eq "nvidia"){ - $ISACMD .= "$HIP_PATH/bin/hipcc -ptx "; - if($ARGV[0] eq "--genco"){ - foreach $isaarg (@ARGV[1..$#ARGV]){ - $ISACMD .= " "; - # use the headers from rocm-path or hip-path - if (($isaarg =~ /--rocm-path/) or ($isaarg =~ /--hip-path/)) { - my @header_path = split('=', $isaarg); - $ISACMD .= '-I' . 
$header_path[1] .'/include'; - } else { - $ISACMD .= $isaarg; - } - } - if ($verbose & 0x1) { - print "hipcc-cmd: ", $ISACMD, "\n"; - } - system($ISACMD) and die(); - exit(0); - } -} - -# TODO: convert toolArgs to an array rather than a string -my $toolArgs = ""; # arguments to pass to the clang or nvcc tool -my $optArg = ""; # -O args - -# TODO: hipcc uses --amdgpu-target for historical reasons. It should be replaced -# by clang option --offload-arch. -my @targetOpts = ('--offload-arch=', '--amdgpu-target='); - -my $targetsStr = ""; -my $skipOutputFile = 0; # file followed by -o should not contibute in picking compiler flags -my $prevArg = ""; # previous argument - -foreach $arg (@ARGV) -{ - # Save $arg, it can get changed in the loop. - $trimarg = $arg; - # TODO: figure out why this space removal is wanted. - # TODO: If someone has gone to the effort of quoting the spaces to the shell - # TODO: why are we removing it here? - $trimarg =~ s/^\s+|\s+$//g; # Remive whitespace - my $swallowArg = 0; - my $escapeArg = 1; - if ($HIP_PLATFORM eq "nvidia") { - if (($trimarg =~ /--rocm-path/) or ($trimarg =~ /--hip-path/)) { - next; - } - } - if ($arg eq '-c' or $arg eq '--genco' or $arg eq '-E') { - $compileOnly = 1; - $needLDFLAGS = 0; - } - - if ($skipOutputFile) { - # TODO: handle filename with shell metacharacters - $toolArgs .= " " . get_normalized_path("$arg"); - $prevArg = $arg; - $skipOutputFile = 0; - next; - } - - if ($arg eq '-o') { - $needLDFLAGS = 1; - $skipOutputFile = 1; - } - - if(($trimarg eq '-stdlib=libc++') and ($setStdLib eq 0)) - { - $HIPCXXFLAGS .= " -stdlib=libc++"; - $setStdLib = 1; - } - - # Check target selection option: --offload-arch= and --amdgpu-target=... - foreach my $targetOpt (@targetOpts) { - if (substr($arg, 0, length($targetOpt)) eq $targetOpt) { - if ($targetOpt eq '--amdgpu-target=') { - print "Warning: The --amdgpu-target option has been deprecated and will be removed in the future. 
Use --offload-arch instead.\n"; - } - # If targets string is not empty, add a comma before adding new target option value. - $targetsStr .= ($targetsStr ? ',' : ''); - $targetsStr .= substr($arg, length($targetOpt)); - $default_amdgpu_target = 0; - # Collect the GPU arch options and pass them to clang later. - if ($HIP_PLATFORM eq "amd") { - $swallowArg = 1; - } - } - } - - if (($arg =~ /--genco/) and $HIP_PLATFORM eq 'amd' ) { - $arg = "--cuda-device-only"; - } - - if($trimarg eq '--version') { - $printHipVersion = 1; - } - if($trimarg eq '--short-version') { - $printHipVersion = 1; - $runCmd = 0; - } - if($trimarg eq '--cxxflags') { - $printCXXFlags = 1; - $runCmd = 0; - } - if($trimarg eq '--ldflags') { - $printLDFlags = 1; - $runCmd = 0; - } - if($trimarg eq '-M') { - $compileOnly = 1; - $buildDeps = 1; - } - if($trimarg eq '-use-staticlib') { - print "Warning: The -use-staticlib option has been deprecated and is no longer needed.\n" - } - if($trimarg eq '-use-sharedlib') { - print "Warning: The -use-sharedlib option has been deprecated and is no longer needed.\n" - } - if($arg =~ m/^-O/) - { - $optArg = $arg; - } - if($arg =~ '--amdhsa-code-object-version=') - { - print "Warning: The --amdhsa-code-object-version option has been deprecated and will be removed in the future. 
Use -mcode-object-version instead.\n"; - $arg =~ s/--amdhsa-code-object-version=//; - $hsacoVersion = $arg; - $swallowArg = 1; - } - - # nvcc does not handle standard compiler options properly - # This can prevent hipcc being used as standard CXX/C Compiler - # To fix this we need to pass -Xcompiler for options - if (($arg eq '-fPIC' or $arg =~ '-Wl,') and $HIP_COMPILER eq 'nvcc') - { - $HIPCXXFLAGS .= " -Xcompiler ".$arg; - $swallowArg = 1; - } - - if ($arg eq '-x') { - $fileTypeFlag = 1; - } elsif (($arg eq 'c' and $prevArg eq '-x') or ($arg eq '-xc')) { - $fileTypeFlag = 1; - $hasC = 1; - $hasCXX = 0; - $hasHIP = 0; - } elsif (($arg eq 'c++' and $prevArg eq '-x') or ($arg eq '-xc++')) { - $fileTypeFlag = 1; - $hasC = 0; - $hasCXX = 1; - $hasHIP = 0; - } elsif (($arg eq 'hip' and $prevArg eq '-x') or ($arg eq '-xhip')) { - $fileTypeFlag = 1; - $hasC = 0; - $hasCXX = 0; - $hasHIP = 1; - } elsif ($arg =~ '-fopenmp-targets=') { - $hasOMPTargets = 1; - } elsif ($arg =~ m/^-/) { - # options start with - - if ($arg eq '-fgpu-rdc') { - $rdc = 1; - } elsif ($arg eq '-fno-gpu-rdc') { - $rdc = 0; - } - - # Process HIPCC options here: - if ($arg =~ m/^--hipcc/) { - $swallowArg = 1; - if ($arg eq "--hipcc-func-supp") { - print "Warning: The --hipcc-func-supp option has been deprecated and will be removed in the future.\n"; - $funcSupp = 1; - } elsif ($arg eq "--hipcc-no-func-supp") { - print "Warning: The --hipcc-no-func-supp option has been deprecated and will be removed in the future.\n"; - $funcSupp = 0; - } - } else { - push (@options, $arg); - } - #print "O: <$arg>\n"; - } elsif ($prevArg ne '-o') { - # input files and libraries - # Skip guessing if `-x {c|c++|hip}` is already specified. 
- - # Add proper file extension before each file type - # File Extension -> Flag - # .c -> -x c - # .cpp/.cxx/.cc/.cu/.cuh/.hip -> -x hip - if ($fileTypeFlag eq 0) { - if ($arg =~ /\.c$/) { - $hasC = 1; - $needCFLAGS = 1; - $toolArgs .= " -x c"; - } elsif (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.cc$/) or ($arg =~ /\.C$/)) { - $needCXXFLAGS = 1; - if ($HIP_COMPILE_CXX_AS_HIP eq '0' or $HIP_PLATFORM ne "amd" or $hasOMPTargets eq 1) { - $hasCXX = 1; - if ($HIP_PLATFORM eq "nvidia") { - $toolArgs .= " -x cu"; - } - } elsif ($HIP_PLATFORM eq "amd") { - $hasHIP = 1; - $toolArgs .= " -x hip"; - } - } elsif ((($arg =~ /\.cu$/ or $arg =~ /\.cuh$/) and $HIP_COMPILE_CXX_AS_HIP ne '0') or ($arg =~ /\.hip$/)) { - $needCXXFLAGS = 1; - if ($HIP_PLATFORM eq "amd") { - $hasHIP = 1; - $toolArgs .= " -x hip"; - } elsif ($HIP_PLATFORM eq "nvidia") { - $toolArgs .= " -x cu"; - } - } - } - if ($hasC) { - $needCFLAGS = 1; - } elsif ($hasCXX or $hasHIP) { - $needCXXFLAGS = 1; - } - push (@inputs, $arg); - #print "I: <$arg>\n"; - } - # Produce a version of $arg where characters significant to the shell are - # quoted. One could quote everything of course but don't bother for - # common characters such as alphanumerics. - # Do the quoting here because sometimes the $arg is changed in the loop - # Important to have all of '-Xlinker' in the set of unquoted characters. - if (not $isWindows and $escapeArg) { - $arg =~ s/[^-a-zA-Z0-9_=+,.\/]/\\$&/g; - } - $toolArgs .= " $arg" unless $swallowArg; - $prevArg = $arg; -} - -if($HIP_PLATFORM eq "amd"){ - # No AMDGPU target specified at commandline. 
So look for HCC_AMDGPU_TARGET - if($default_amdgpu_target eq 1) { - if (defined $ENV{HCC_AMDGPU_TARGET}) { - $targetsStr = $ENV{HCC_AMDGPU_TARGET}; - } elsif (not $isWindows) { - # Else try using rocm_agent_enumerator - $ROCM_AGENT_ENUM = "${ROCM_PATH}/bin/rocm_agent_enumerator"; - $targetsStr = `${ROCM_AGENT_ENUM} -t GPU`; - $targetsStr =~ s/\n/,/g; - } - $default_amdgpu_target = 0; - } - - # Parse the targets collected in targetStr and set corresponding compiler options. - my @targets = split(',', $targetsStr); - $GPU_ARCH_OPT = " --offload-arch="; - - foreach my $val (@targets) { - # Ignore 'gfx000' target reported by rocm_agent_enumerator. - if ($val ne 'gfx000') { - my @procAndFeatures = split(':', $val); - $len = scalar @procAndFeatures; - my $procName; - if($len ge 1 and $len le 3) { # proc and features - $procName = $procAndFeatures[0]; - for my $i (1 .. $#procAndFeatures) { - if (grep($procAndFeatures[$i], @knownFeatures) eq 0) { - print "Warning: The Feature: $procAndFeatures[$i] is unknown. Correct compilation is not guaranteed.\n"; - } - } - } else { - $procName = $val; - } - $GPU_ARCH_ARG = $GPU_ARCH_OPT . $val; - $HIPLDARCHFLAGS .= $GPU_ARCH_ARG; - if ($HIP_PLATFORM eq 'amd' and $hasHIP) { - $HIPCXXFLAGS .= $GPU_ARCH_ARG; - } - } - } - if ($hsacoVersion > 0) { - if ($compileOnly eq 0) { - $HIPLDFLAGS .= " -mcode-object-version=$hsacoVersion"; - } else { - $HIPCXXFLAGS .= " -mcode-object-version=$hsacoVersion"; - } - } - - # rocm_agent_enumerator failed! Throw an error and die if linking is required - if ($default_amdgpu_target eq 1 and $compileOnly eq 0) { - print "No valid AMD GPU target was either specified or found. 
Please specify a valid target using --offload-arch=.\n" and die(); - } - - $ENV{HCC_EXTRA_LIBRARIES}="\n"; -} - -if ($buildDeps and $HIP_PLATFORM eq 'nvidia') { - $HIPCXXFLAGS .= " -M -D__CUDACC__"; - $HIPCFLAGS .= " -M -D__CUDACC__"; -} - -if ($buildDeps and $HIP_PLATFORM eq 'amd') { - $HIPCXXFLAGS .= " --cuda-host-only"; -} - -# hipcc currrently requires separate compilation of source files, ie it is not possible to pass -# CPP files combined with .O files -# Reason is that NVCC uses the file extension to determine whether to compile in CUDA mode or -# pass-through CPP mode. - -if ($HIP_PLATFORM eq "amd") { - # Set default optimization level to -O3 for hip-clang. - if ($optArg eq "") { - $HIPCXXFLAGS .= " -O3"; - $HIPCFLAGS .= " -O3"; - $HIPLDFLAGS .= " -O3"; - } - if (!$funcSupp and $optArg ne "-O0" and $hasHIP) { - $HIPCXXFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false"; - if ($needLDFLAGS and not $needCXXFLAGS) { - $HIPLDFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false"; - } - } - - # If the HIP_PATH env var is defined, pass that path to Clang - if ($ENV{'HIP_PATH'}) { - my $hip_path_flag = " --hip-path=" . get_normalized_path("$HIP_PATH"); - $HIPCXXFLAGS .= $hip_path_flag; - $HIPLDFLAGS .= $hip_path_flag; - } - - if ($hasHIP) { - if (defined $DEVICE_LIB_PATH) { - $HIPCXXFLAGS .= " --hip-device-lib-path=" . 
get_normalized_path("$DEVICE_LIB_PATH"); - } - } - - if (!$compileOnly) { - $HIPLDFLAGS .= " --hip-link"; - if ($rdc) { - $HIPLDFLAGS .= $HIPLDARCHFLAGS; - } - if (not $isWindows) { - $HIPLDFLAGS .= " --rtlib=compiler-rt -unwindlib=libgcc"; - - } - } -} - -# TODO: convert CMD to an array rather than a string -my $CMD="$HIPCC"; - -if ($needCFLAGS) { - $CMD .= " $HIPCFLAGS"; -} - -if ($needCXXFLAGS) { - $CMD .= " $HIPCXXFLAGS"; -} - -if ($needLDFLAGS and not $compileOnly) { - $CMD .= " $HIPLDFLAGS"; -} -$CMD .= " $toolArgs"; - -if (($needCFLAGS or $needCXXFLAGS) and $HIPCC_COMPILE_FLAGS_APPEND) { - $CMD .= " $HIPCC_COMPILE_FLAGS_APPEND"; -} - -if ($needLDFLAGS and not $compileOnly and $HIPCC_LINK_FLAGS_APPEND) { - $CMD .= " $HIPCC_LINK_FLAGS_APPEND"; -} - -if ($verbose & 0x1) { - print "hipcc-cmd: ", $CMD, "\n"; -} - -if ($printHipVersion) { - if ($runCmd) { - print "HIP version: " - } - print $HIP_VERSION, "\n"; -} -if ($printCXXFlags) { - print $HIPCXXFLAGS; -} -if ($printLDFlags) { - print $HIPLDFLAGS; -} -if ($runCmd) { - system ("$CMD"); - if ($? == -1) { - print "failed to execute: $!\n"; - exit($?); - } - elsif ($? & 127) { - printf "child died with signal %d, %s coredump\n", - ($? & 127), ($? & 128) ? 'with' : 'without'; - exit($?); - } - else { - $CMD_EXIT_CODE = $? >> 8; - } - exit($CMD_EXIT_CODE); -} - -# vim: ts=4:sw=4:expandtab:smartindent diff --git a/amd/hipcc/bin/hipconfig.pl b/amd/hipcc/bin/hipconfig.pl deleted file mode 100755 index e5b13972aa8bf..0000000000000 --- a/amd/hipcc/bin/hipconfig.pl +++ /dev/null @@ -1,239 +0,0 @@ -#!/usr/bin/env perl -# Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
- -# Need perl > 5.10 to use logic-defined or -use 5.006; use v5.10.1; -use warnings; -use Getopt::Long; -use Cwd; - -# Return name of HIP compiler - either 'clang' or 'nvcc' -# -use Getopt::Long; -use File::Basename; - -my $base_dir; -BEGIN { - $base_dir = dirname( Cwd::realpath(__FILE__) ); -} -use lib "$base_dir/"; -use hipvars; - -$isWindows = $hipvars::isWindows; -$HIP_RUNTIME = $hipvars::HIP_RUNTIME; -$HIP_PLATFORM = $hipvars::HIP_PLATFORM; -$HIP_COMPILER = $hipvars::HIP_COMPILER; -$HIP_CLANG_PATH = $hipvars::HIP_CLANG_PATH; -$CUDA_PATH = $hipvars::CUDA_PATH; -$HIP_PATH = $hipvars::HIP_PATH; -$ROCM_PATH = $hipvars::ROCM_PATH; -$HIP_VERSION = $hipvars::HIP_VERSION; - -Getopt::Long::Configure ( qw{bundling no_ignore_case}); -GetOptions( - "help|h" => \$p_help - ,"path|p" => \$p_path - ,"rocmpath|R" => \$p_rocmpath - ,"compiler|c" => \$p_compiler - ,"platform|P" => \$p_platform - ,"runtime|r" => \$p_runtime - ,"hipclangpath|l" => \$p_hipclangpath - ,"cpp_config|cxx_config|C" => \$p_cpp_config - ,"full|f|info" => \$p_full, - ,"version|v" => \$p_version, - ,"check" => \$p_check, - ,"newline|n" => \$p_newline -); - -if ($HIP_COMPILER eq "clang") { - $HIP_CLANG_INCLUDE = ""; - if($isWindows) { - $HIP_CLANG_INCLUDE = `\"$HIP_CLANG_PATH/clang++\" --print-resource-dir`; - } else { - $HIP_CLANG_INCLUDE = `$HIP_CLANG_PATH/clang++ --print-resource-dir`; - chomp($HIP_CLANG_INCLUDE) - } - - $CPP_CONFIG = " -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__="; - - $HIP_PATH_INCLUDE = $HIP_PATH."/include"; - if($isWindows) { - $CPP_CONFIG .= " -I\"$HIP_PATH_INCLUDE\" -I\"$HIP_CLANG_INCLUDE\""; - } else { - $CPP_CONFIG .= " -I$HIP_PATH_INCLUDE -I$HIP_CLANG_INCLUDE "; - } -} -if ($HIP_PLATFORM eq "nvidia") { - $CPP_CONFIG = " -D__HIP_PLATFORM_NVCC__= -D__HIP_PLATFORM_NVIDIA__= -I$HIP_PATH/include -I$CUDA_PATH/include"; -}; - -if ($p_help) { - print "usage: hipconfig [OPTIONS]\n"; - print " --path, -p : print HIP_PATH (use env var if set, else determine from hipconfig path)\n"; - 
print " --rocmpath, -R : print ROCM_PATH (use env var if set, else determine from hip path or /opt/rocm)\n"; - print " --cpp_config, -C : print C++ compiler options\n"; - print " --compiler, -c : print compiler (clang or nvcc)\n"; - print " --platform, -P : print platform (amd or nvidia)\n"; - print " --runtime, -r : print runtime (rocclr or cuda)\n"; - print " --hipclangpath, -l : print HIP_CLANG_PATH\n"; - print " --full, -f : print full config\n"; - print " --version, -v : print hip version\n"; - print " --check : check configuration\n"; - print " --newline, -n : print newline\n"; - print " --help, -h : print help message\n"; - exit(); -} - -if ($p_path) { - print "$HIP_PATH"; - $printed = 1; -} - -if ($p_rocmpath) { - print "$ROCM_PATH"; - $printed = 1; -} - -if ($p_cpp_config) { - print $CPP_CONFIG; - $printed = 1; -} - -if ($p_compiler) { - print $HIP_COMPILER; - $printed = 1; -} - -if ($p_platform) { - print $HIP_PLATFORM; - $printed = 1; -} - -if ($p_runtime) { - print $HIP_RUNTIME; - $printed = 1; -} - -if ($p_hipclangpath) { - if (defined $HIP_CLANG_PATH) { - print $HIP_CLANG_PATH; - } - $printed = 1; -} - -if ($p_version) { - print $HIP_VERSION; - $printed = 1; -} - -if (!$printed or $p_full) { - print "HIP version : ", $HIP_VERSION, "\n\n"; - print "== hipconfig\n"; - print "HIP_PATH : ", $HIP_PATH, "\n"; - print "ROCM_PATH : ", $ROCM_PATH, "\n"; - print "HIP_COMPILER : ", $HIP_COMPILER, "\n"; - print "HIP_PLATFORM : ", $HIP_PLATFORM, "\n"; - print "HIP_RUNTIME : ", $HIP_RUNTIME, "\n"; - print "CPP_CONFIG : ", $CPP_CONFIG, "\n"; - if ($HIP_PLATFORM eq "amd") - { - print "\n" ; - if ($HIP_COMPILER eq "clang") - { - print "== hip-clang\n"; - print ("HIP_CLANG_PATH : $HIP_CLANG_PATH\n"); - if ($isWindows) { - system("\"$HIP_CLANG_PATH/clang++\" --version"); - system("\"$HIP_CLANG_PATH/llc\" --version"); - printf("hip-clang-cxxflags : "); - $win_output = `perl \"$HIP_PATH/bin/hipcc\" --cxxflags`; - printf("$win_output \n"); - printf("hip-clang-ldflags : "); 
- $win_output = `perl \"$HIP_PATH/bin/hipcc\" --ldflags`; - printf("$win_output \n"); - } else { - system("$HIP_CLANG_PATH/clang++ --version"); - system("$HIP_CLANG_PATH/llc --version"); - print ("hip-clang-cxxflags : "); - system("$HIP_PATH/bin/hipcc --cxxflags"); - printf("\n"); - print ("hip-clang-ldflags : "); - system("$HIP_PATH/bin/hipcc --ldflags"); - printf("\n"); - } - } else { - print ("Unexpected HIP_COMPILER: $HIP_COMPILER\n"); - } - } - if ($HIP_PLATFORM eq "nvidia") { - print "\n" ; - print "== nvcc\n"; - print "CUDA_PATH : ", $CUDA_PATH, "\n"; - system("$CUDA_PATH/bin/nvcc --version"); - - } - print "\n" ; - - print "=== Environment Variables\n"; - if ($isWindows) { - print ("PATH=$ENV{PATH}\n"); - system("set | findstr //B //C:\"HIP\" //C:\"CUDA\" //C:\"LD_LIBRARY_PATH\""); - } else { - system("echo PATH=\$PATH"); - system("env | egrep '^HIP|^CUDA|^LD_LIBRARY_PATH'"); - } - - - print "\n" ; - if ($isWindows) { - print "== Windows Display Drivers\n"; - print "Hostname : "; system ("hostname"); - system ("wmic path win32_VideoController get AdapterCompatibility,InstalledDisplayDrivers,Name | findstr //B //C:\"Advanced Micro Devices\""); - } else { - print "== Linux Kernel\n"; - print "Hostname : "; system ("hostname"); - system ("uname -a"); - } - - if (-e "/usr/bin/lsb_release") { - system ("/usr/bin/lsb_release -a"); - } - - print "\n" ; - $printed = 1; -} - - -if ($p_check) { - print "\nCheck system installation:\n"; - - printf ("%-70s", "check hipconfig in PATH..."); - # Safer to use which hipconfig instead of invoking hipconfig - if (system ("which hipconfig > /dev/null 2>&1") != 0) { - print "FAIL\n"; - } else { - printf "good\n"; - } -} - -if ($p_newline) { - print "\n"; -} diff --git a/amd/hipcc/docs/env.rst b/amd/hipcc/docs/env.rst index 268a137aa3ace..31cf486858c77 100644 --- a/amd/hipcc/docs/env.rst +++ b/amd/hipcc/docs/env.rst @@ -8,7 +8,9 @@ HIPCC environment variables ****************************************** -The environment variable 
``HIP_PLATFORM`` can be used to specify ``amd`` or ``nvidia`` depending on the available backend tool flows: +This topic provides descriptions of the HIPCC environment +variables. For more information about other ROCm environment variables, see +`HIP environment variables `_. * ``HIP_PLATFORM`` = ``amd`` or ``HIP_PLATFORM`` = ``nvidia`` diff --git a/amd/hipcc/docs/index.rst b/amd/hipcc/docs/index.rst index 7ffef516cc1b8..642e501cd82a0 100644 --- a/amd/hipcc/docs/index.rst +++ b/amd/hipcc/docs/index.rst @@ -8,12 +8,12 @@ HIPCC documentation ****************************************** -``hipcc`` is a compiler driver utility that will call clang or nvcc, depending on target, and pass the appropriate include and library options for the target compiler and HIP infrastructure. +.. note:: + ROCm provides and supports multiple compilers as described in `ROCm compiler reference `_. -There are both C++ and Perl executable versions of the ``hipcc`` and ``hipconfig`` compiler driver utilities provided. By default the C++ version is used when ``hipcc`` is run. +``hipcc`` is a compiler driver utility that will call ``clang`` or ``nvcc``, depending on target, and pass the appropriate include and library options for the target compiler and HIP infrastructure. C++ executable versions of ``hipcc`` and ``hipconfig`` compiler driver utilities are provided. -.. note:: - You can manually run the Perl scripts using hipcc.pl and hipconfig.pl from the installation. However, you must ensure Perl is installed on the system for the scripts to work. Perl is not automatically installed with the ROCm installation. 
+The HIPCC public repository is located at `https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc `_ The documentation is structured as follows: diff --git a/amd/hipcc/docs/test.md b/amd/hipcc/docs/test.md deleted file mode 100644 index 890232e52bad6..0000000000000 --- a/amd/hipcc/docs/test.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -myst: - html_meta: - "description lang=en": "Testing HIPCC command" - "keywords": "HIPCC, ROCm, HIP tools, HIP compiler" ---- - -# Testing - -Currently hipcc/hipconfig executables are tested by building and executing HIP tests. Separate tests for hipcc/hipconfig are not planned. diff --git a/amd/hipcc/docs/usage.rst b/amd/hipcc/docs/usage.rst index 0d7a2d5edeee4..7b1c5ed845fe0 100644 --- a/amd/hipcc/docs/usage.rst +++ b/amd/hipcc/docs/usage.rst @@ -8,8 +8,7 @@ Using HIPCC ****************************************** -The built executables can be used the same way as the ``hipcc`` and ``hipconfig`` Perl scripts. -To use the newly built executables from the build folder use ``./`` in front of the executable name. +To use the newly built ``hipcc`` and ``hipconfig`` executables from the build folder use ``./`` in front of the executable name. For example: .. 
code-block:: shell @@ -18,4 +17,3 @@ For example: ./hipcc --help ./hipcc --version ./hipconfig --full - diff --git a/amd/hipcc/src/hipBin_amd.h b/amd/hipcc/src/hipBin_amd.h index 7018727809db4..ecea39e071b40 100644 --- a/amd/hipcc/src/hipBin_amd.h +++ b/amd/hipcc/src/hipBin_amd.h @@ -51,7 +51,7 @@ class HipBinAmd : public HipBinBase { public: HipBinAmd(); - virtual ~HipBinAmd() = default; + ~HipBinAmd() override = default; virtual bool detectPlatform(); virtual void constructCompilerPath(); virtual const string& getCompilerPath() const; diff --git a/amd/hipcc/src/hipBin_base.h b/amd/hipcc/src/hipBin_base.h index 94b498a255dd1..ea37e6fd12fc8 100644 --- a/amd/hipcc/src/hipBin_base.h +++ b/amd/hipcc/src/hipBin_base.h @@ -203,6 +203,7 @@ enum HipBinCommand { class HipBinBase { public: HipBinBase(); + virtual ~HipBinBase() = default; // Interface functions virtual void constructCompilerPath() = 0; virtual void printFull() = 0; diff --git a/amd/hipcc/src/hipBin_nvidia.h b/amd/hipcc/src/hipBin_nvidia.h index 691055fbdf3d4..c2adeec81da23 100644 --- a/amd/hipcc/src/hipBin_nvidia.h +++ b/amd/hipcc/src/hipBin_nvidia.h @@ -37,7 +37,7 @@ class HipBinNvidia : public HipBinBase { public: HipBinNvidia(); - virtual ~HipBinNvidia() = default; + ~HipBinNvidia() override = default; virtual bool detectPlatform(); virtual void constructCompilerPath(); virtual const string& getCompilerPath() const; diff --git a/clang/docs/ClangOffloadBundler.rst b/clang/docs/ClangOffloadBundler.rst index bceb4060992fc..5570dbb08ab9a 100644 --- a/clang/docs/ClangOffloadBundler.rst +++ b/clang/docs/ClangOffloadBundler.rst @@ -266,15 +266,14 @@ without differentiation based on offload kind. The target triple of the code object. See `Target Triple `_. 
- The bundler accepts target triples with or without the optional environment - field: + LLVM target triples can be with or without the optional environment field: ``--``, or ``---`` - However, in order to standardize outputs for tools that consume bitcode - bundles, bundles written by the bundler internally use only the 4-field - target triple: + However, in order to standardize outputs for tools that consume bitcode bundles + and to parse target ID containing dashes, the bundler only accepts target + triples in the 4-field format: ``---`` @@ -526,15 +525,15 @@ The compressed offload bundle begins with a header followed by the compressed bi This is a unique identifier to distinguish compressed offload bundles. The value is the string 'CCOB' (Compressed Clang Offload Bundle). - **Version Number (16-bit unsigned int)**: - This denotes the version of the compressed offload bundle format. The current version is `2`. + This denotes the version of the compressed offload bundle format. The current version is `3`. - **Compression Method (16-bit unsigned int)**: This field indicates the compression method used. The value corresponds to either `zlib` or `zstd`, represented as a 16-bit unsigned integer cast from the LLVM compression enumeration. -- **Total File Size (32-bit unsigned int)**: +- **Total File Size (unsigned int, 32-bit in v2, 64-bit in v3)**: This is the total size (in bytes) of the file, including the header. Available in version 2 and above. -- **Uncompressed Binary Size (32-bit unsigned int)**: +- **Uncompressed Binary Size (unsigned int, 32-bit in v2, 64-bit in v3)**: This is the size (in bytes) of the binary data before it was compressed. - **Hash (64-bit unsigned int)**: @@ -543,4 +542,4 @@ The compressed offload bundle begins with a header followed by the compressed bi - **Compressed Data**: The actual compressed binary data follows the header. Its size can be inferred from the total size of the file minus the header size. 
- > **Note**: Version 3 of the format is under development. It uses 64-bit fields for Total File Size and Uncompressed Binary Size to support files larger than 4GB. To experiment with version 3, set the environment variable `COMPRESSED_BUNDLE_FORMAT_VERSION=3`. This support is experimental and not recommended for production use. \ No newline at end of file + > **Note**: Version 3 is now the default format. For backward compatibility with older HIP runtimes that support version 2 only, set the environment variable `COMPRESSED_BUNDLE_FORMAT_VERSION=2`. diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst index 481ed39230813..33cec9f2f47c5 100644 --- a/clang/docs/HIPSupport.rst +++ b/clang/docs/HIPSupport.rst @@ -498,7 +498,7 @@ Predefined Macros * - ``__HIPSTDPAR__`` - Defined when Clang is compiling code in algorithm offload mode, enabled with the ``--hipstdpar`` compiler option. - * - ``__HIPSTDPAR_INTERPOSE_ALLOC__`` + * - ``__HIPSTDPAR_INTERPOSE_ALLOC__`` / ``__HIPSTDPAR_INTERPOSE_ALLOC_V1__`` - Defined only when compiling in algorithm offload mode, when the user enables interposition mode with the ``--hipstdpar-interpose-alloc`` compiler option, indicating that all dynamic memory allocation / diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 53c5955441ed6..4cfc574274daa 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -4788,6 +4788,118 @@ If no address spaces names are provided, all address spaces are fenced. 
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local") __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global") +__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``__builtin_amdgcn_processor_is`` and ``__builtin_amdgcn_is_invocable`` provide +a functional mechanism for programatically querying: + +* the identity of the current target processor; +* the capability of the current target processor to invoke a particular builtin. + +**Syntax**: + +.. code-block:: c + + __amdgpu_feature_predicate_t __builtin_amdgcn_processor_is(const char*); + __amdgpu_feature_predicate_t __builtin_amdgcn_is_invocable(builtin_name); + +**Example of use**: + +.. code-block:: c++ + + if (__builtin_amdgcn_processor_is("gfx1201") || + __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) + __builtin_amdgcn_s_sleep_var(x); + + if (!__builtin_amdgcn_processor_is("gfx906")) + __builtin_amdgcn_s_wait_event_export_ready(); + else if (__builtin_amdgcn_processor_is("gfx1010") || + __builtin_amdgcn_processor_is("gfx1101")) + __builtin_amdgcn_s_ttracedata_imm(1); + + while (__builtin_amdgcn_processor_is("gfx1101")) *p += x; + + do { + break; + } while (__builtin_amdgcn_processor_is("gfx1010")); + + for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break; + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)) + __builtin_amdgcn_s_wait_event_export_ready(); + else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm)) + __builtin_amdgcn_s_ttracedata_imm(1); + + do { + break; + } while ( + __builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); + + for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) + break; + +**Description**: + +The builtins return a value of type ``__amdgpu_feature_predicate_t``, which is a +target specific type that behaves as if its C++ definition was the following: + +.. 
code-block:: c++ + + struct __amdgpu_feature_predicate_t { + __amdgpu_feature_predicate_t() = delete; + __amdgpu_feature_predicate_t(const __amdgpu_feature_predicate_t&) = delete; + __amdgpu_feature_predicate_t(__amdgpu_feature_predicate_t&&) = delete; + + explicit + operator bool() const noexcept; + }; + +The builtins can be used in C as well, wherein the +``__amdgpu_feature_predicate_t`` type behaves as an opaque, forward declared +type with conditional automated conversion to ``_Bool`` when used as the +predicate argument to a control structure: + +.. code-block:: c + + struct __amdgpu_feature_predicate_t ret(); // Error + void arg(struct __amdgpu_feature_predicate_t); // Error + void local() { + struct __amdgpu_feature_predicate_t x; // Error + struct __amdgpu_feature_predicate_t y = + __builtin_amdgcn_processor_is("gfx900"); // Error + } + void valid_use() { + _Bool x = (_Bool)__builtin_amdgcn_processor_is("gfx900"); // OK + if (__builtin_amdgcn_processor_is("gfx900")) // Implicit cast to _Bool + return; + for (; __builtin_amdgcn_processor_is("gfx900");) // Implicit cast to _Bool + break; + while (__builtin_amdgcn_processor_is("gfx900")) // Implicit cast to _Bool + break; + do { + break; + } while (__builtin_amdgcn_processor_is("gfx900")); // Implicit cast to _Bool + + __builtin_amdgcn_processor_is("gfx900") ? x : !x; + } + +The boolean interpretation of the predicate values returned by the builtins: + +* indicates whether the current target matches the argument; the argument MUST + be a string literal and a valid AMDGPU target +* indicates whether the builtin function passed as the argument can be invoked + by the current target; the argument MUST be either a generic or AMDGPU + specific builtin name + +When invoked while compiling for a concrete target, the builtins are evaluated +early by Clang, and never produce any CodeGen effects / have no observable +side-effects in IR. 
Conversely, when compiling for AMDGCN flavoured SPIR-v, +which is an abstract target, a series of predicate values are implicitly +created. These predicates get resolved when finalizing the compilation process +for a concrete target, and shall reflect the latter's identity and features. +Thus, it is possible to author high-level code, in e.g. HIP, that is target +adaptive in a dynamic fashion, contrary to macro based mechanisms. ARM/AArch64 Language Extensions ------------------------------- diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7cd39c98173c3..06a1cb37282da 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -466,6 +466,7 @@ Non-comprehensive list of changes in this release - The floating point comparison builtins (``__builtin_isgreater``, ``__builtin_isgreaterequal``, ``__builtin_isless``, etc.) and ``__builtin_signbit`` can now be used in constant expressions. +- For AMDPGU targets, added `__builtin_v_cvt_off_f32_i4` that maps to the `v_cvt_off_f32_i4` instruction. - Plugins can now define custom attributes that apply to statements as well as declarations. - ``__builtin_abs`` function can now be used in constant expressions. @@ -1077,6 +1078,9 @@ Bug Fixes to C++ Support template parameter. Now, such expression can be used with ``static_assert`` and ``constexpr``. (#GH123498) - Correctly determine the implicit constexprness of lambdas in dependent contexts. (#GH97958) (#GH114234) - Fix that some dependent immediate expressions did not cause immediate escalation (#GH119046) +- Fixes matching of nested template template parameters. (#GH130362) +- Correctly diagnoses template template paramters which have a pack parameter + not in the last position. Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1137,6 +1141,12 @@ AMDGPU Support definitions for GPU builtin functions. This header can be included for OpenMP, CUDA, HIP, OpenCL, and C/C++. +- Bump the default code object version to 6. 
+- Introduced a new target specific builtin ``__builtin_amdgcn_processor_is``, + a late / deferred query for the current target processor +- Introduced a new target specific builtin ``__builtin_amdgcn_is_invocable``, + which enables fine-grained, per-builtin, feature availability + NVPTX Support ^^^^^^^^^^^^^^ @@ -1264,6 +1274,8 @@ CUDA/HIP Language Changes - Fixed a bug about overriding a constexpr pure-virtual member function with a non-constexpr virtual member function which causes compilation failure when including standard C++ header `format`. - Added initial support for version 3 of the compressed offload bundle format, which uses 64-bit fields for Total File Size and Uncompressed Binary Size. This enables support for files larger than 4GB. The support is currently experimental and can be enabled by setting the environment variable `COMPRESSED_BUNDLE_FORMAT_VERSION=3`. +* Provide a __device__ version of std::__glibcxx_assert_fail() in a header wrapper. + CUDA Support ^^^^^^^^^^^^ - Clang now supports CUDA SDK up to 12.6 diff --git a/clang/include/clang/Basic/AMDGPUTypes.def b/clang/include/clang/Basic/AMDGPUTypes.def index d3dff446f9edf..a0574c640184b 100644 --- a/clang/include/clang/Basic/AMDGPUTypes.def +++ b/clang/include/clang/Basic/AMDGPUTypes.def @@ -20,10 +20,18 @@ AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) #endif +#ifndef AMDGPU_FEATURE_PREDICATE_TYPE +#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ + AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) +#endif + AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", AMDGPUBufferRsrc, AMDGPUBufferRsrcTy, 128, 128, 8) AMDGPU_NAMED_BARRIER_TYPE("__amdgpu_named_workgroup_barrier_t", AMDGPUNamedWorkgroupBarrier, AMDGPUNamedWorkgroupBarrierTy, 128, 32, 0) +AMDGPU_FEATURE_PREDICATE_TYPE("__amdgpu_feature_predicate_t", AMDGPUFeaturePredicate, AMDGPUFeaturePredicateTy, 1, 1) + #undef AMDGPU_TYPE #undef AMDGPU_OPAQUE_PTR_TYPE #undef AMDGPU_NAMED_BARRIER_TYPE +#undef 
AMDGPU_FEATURE_PREDICATE_TYPE diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index 48437c9397570..27f78af16fe06 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -34,6 +34,7 @@ // Q -> target builtin type, followed by a character to distinguish the builtin type // Qa -> AArch64 svcount_t builtin type. // Qb -> AMDGPU __amdgpu_buffer_rsrc_t builtin type. +// Qc -> AMDGPU __amdgpu_feature_predicate_t builtin type. // E -> ext_vector, followed by the number of elements and the base type. // X -> _Complex, followed by the base type. // Y -> ptrdiff_t diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 1b29a8e359c20..f3b857ee7b1d9 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -140,6 +140,7 @@ BUILTIN(__builtin_amdgcn_cvt_pknorm_u16, "E2Usff", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_i16, "E2sii", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_u16, "E2UsUiUi", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_u8_f32, "UifUiUi", "nc") +BUILTIN(__builtin_amdgcn_cvt_off_f32_i4, "fi", "nc") BUILTIN(__builtin_amdgcn_sad_u8, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_msad_u8, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_sad_hi_u8, "UiUiUiUi", "nc") @@ -162,6 +163,8 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b64, "V2UiQbiiIi", "n") BUILTIN(__builtin_amdgcn_raw_buffer_load_b96, "V3UiQbiiIi", "n") BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n") +TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_load_lds, "vQbv*3IUiiiIiIi", "t", "vmem-to-lds-load-insts") + //===----------------------------------------------------------------------===// // Ballot builtins. 
//===----------------------------------------------------------------------===// @@ -254,7 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts") -TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "gfx940-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts") //===----------------------------------------------------------------------===// // Deep learning builtins. @@ -297,7 +300,6 @@ TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_h, "V4UiUifV4fV4hV4hV4Ui TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_l, "V4UiWUifV4fV4fV4fV4Ui", "nc", "gfx10-insts") TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, "V4UiWUifV4fV4hV4hV4Ui", "nc", "gfx10-insts") - //===----------------------------------------------------------------------===// // GFX11+ only builtins. //===----------------------------------------------------------------------===// @@ -306,6 +308,8 @@ TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, "V4UiWUifV4fV4hV4hV4 TARGET_BUILTIN(__builtin_amdgcn_permlane64, "UiUi", "nc", "gfx11-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wait_event_export_ready, "v", "n", "gfx11-insts") +TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn, "V2UiUiUiV4UiIi", "n", "gfx11-insts") + //===----------------------------------------------------------------------===// // WMMA builtins. // Postfix w32 indicates the builtin requires wavefront size of 32. 
@@ -346,6 +350,11 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr") BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n") BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n") +// These are special FE only builtins intended for forwarding the requirements +// to the ME. +BUILTIN(__builtin_amdgcn_processor_is, "QccC*", "nctu") +BUILTIN(__builtin_amdgcn_is_invocable, "Qc", "nctu") + //===----------------------------------------------------------------------===// // R600-NI only builtins. //===----------------------------------------------------------------------===// @@ -495,6 +504,16 @@ TARGET_BUILTIN(__builtin_amdgcn_s_get_named_barrier_state, "Uiv*", "n", "gfx12-i TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_buffer_prefetch_data, "vQbIiUi", "nc", "gfx12-insts") +// For the following two builtins, the second and third return values of the +// intrinsics are returned through the last two pointer-type function arguments. +TARGET_BUILTIN(__builtin_amdgcn_image_bvh8_intersect_ray, "V10UiWUifUcV3fV3fUiV4UiV3f*V3f*", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_bvh_dual_intersect_ray, "V10UiWUifUcV3fV3fV2UiV4UiV3f*V3f*", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn, "V2UiUiUiV8UiIi", "n", "gfx12-insts") + +// The intrinsic returns {i64, i32}, the builtin returns <2 x i64>. +// The second return value of the intrinsic is zext'ed. 
+TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn, "V2WUiUiUiV8UiIi", "n", "gfx12-insts") + TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32") diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 04d70c3b796e9..9d4ea2aa5546b 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11695,9 +11695,9 @@ def err_omp_inscan_reduction_expected : Error< def note_omp_previous_inscan_reduction : Note< "'reduction' clause with 'inscan' modifier is used here">; def err_omp_multivar_xteam_scan_unsupported : Error< - "multiple list items are not yet supported with the 'inclusive' or the 'exclusive' clauses that appear with the 'scan' directive">; + "multiple list items are not yet supported with the 'inclusive' or the 'exclusive' clauses that appear with the 'scan' directive">; def err_omp_xteam_scan_prohibited : Error< - "'scan' directive is not supported inside target regions. Use flag '-fopenmp-target-xteam-scan' to enable it">; + "'scan' directive is not supported inside target regions. 
Use flag '-fopenmp-target-xteam-scan' to enable it">; def err_omp_expected_predefined_allocator : Error< "expected one of the predefined allocators for the variables with the static " "storage: 'omp_default_mem_alloc', 'omp_large_cap_mem_alloc', " @@ -12907,6 +12907,27 @@ def err_acc_update_as_body "statement| switch statement| label statement}0">; // AMDGCN builtins diagnostics -def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">; -def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">; +def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">; +def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">; +def err_amdgcn_processor_is_arg_not_literal + : Error<"the argument to __builtin_amdgcn_processor_is must be a string " + "literal">; +def err_amdgcn_processor_is_arg_invalid_value + : Error<"the argument to __builtin_amdgcn_processor_is must be a valid " + "AMDGCN processor identifier; '%0' is not valid">; +def note_amdgcn_processor_is_valid_options + : Note<"valid AMDGCN processor identifiers are: %0">; +def err_amdgcn_is_invocable_arg_invalid_value + : Error<"the argument to __builtin_amdgcn_is_invocable must be either a " + "target agnostic builtin or an AMDGCN target specific builtin; `%0`" + " is not valid">; +def err_amdgcn_predicate_type_is_not_constructible + : Error<"%0 has type __amdgpu_feature_predicate_t, which is not" + " constructible">; +def err_amdgcn_predicate_type_needs_explicit_bool_cast + : Error<"%0 must be explicitly cast to %1; however, please note that this " + "is almost always an error and that it prevents the effective " + "guarding of target dependent code, and thus should be avoided">; +def note_amdgcn_protected_by_predicate : Note<"jump enters statement controlled" + " by AMDGPU feature predicate">; } // end of sema component. 
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index cd586201e79c0..059a1c257dbc4 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -903,7 +903,7 @@ llvm::Error expandResponseFiles(SmallVectorImpl &Args, /// See applyOneOverrideOption. void applyOverrideOptions(SmallVectorImpl &Args, const char *OverrideOpts, - llvm::StringSet<> &SavedStrings, + llvm::StringSet<> &SavedStrings, StringRef EnvVar, raw_ostream *OS = nullptr); } // end namespace driver diff --git a/clang/include/clang/Driver/OffloadBundler.h b/clang/include/clang/Driver/OffloadBundler.h index 31c11e25ecd9f..2c49fbf408e0e 100644 --- a/clang/include/clang/Driver/OffloadBundler.h +++ b/clang/include/clang/Driver/OffloadBundler.h @@ -118,6 +118,7 @@ class CompressedOffloadBundle { static inline const size_t UncompressedSizeFieldSizeV3 = sizeof(uint64_t); static inline const size_t HashFieldSize = sizeof(uint64_t); +public: // Keep V1 header size for backward compatibility static inline const size_t V1HeaderSize = MagicSize + VersionFieldSize + MethodFieldSize + @@ -135,8 +136,7 @@ class CompressedOffloadBundle { static inline const llvm::StringRef MagicNumber = "CCOB"; -public: - static inline const uint16_t DefaultVersion = 2; + static inline const uint16_t DefaultVersion = 3; // Helper method to get header size based on version static size_t getHeaderSize(uint16_t Version) { @@ -158,6 +158,11 @@ class CompressedOffloadBundle { static llvm::Expected> decompress(const llvm::MemoryBuffer &Input, bool Verbose = false); }; + +/// Check whether the bundle id is in the following format: +/// -[-[:target features]] +/// := --- +bool checkOffloadBundleID(const llvm::StringRef Str); } // namespace clang #endif // LLVM_CLANG_DRIVER_OFFLOADBUNDLER_H diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 629bd931848dc..bc9ea6974e1c9 100644 --- a/clang/include/clang/Driver/Options.td +++ 
b/clang/include/clang/Driver/Options.td @@ -1056,6 +1056,7 @@ def Xopenmp_target_EQ : JoinedAndSeparate<["-"], "Xopenmp-target=">, Group to the target offloading toolchain identified by .">, MetaVarName<" ">; def z : Separate<["-"], "z">, Flags<[LinkerInput]>, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, HelpText<"Pass -z to the linker">, MetaVarName<"">, Group; def offload_link : Flag<["--"], "offload-link">, Group, @@ -1643,11 +1644,15 @@ defm xl_pragma_pack : BoolFOption<"xl-pragma-pack", "Enable IBM XL #pragma pack handling">, NegFlag>; def shared_libsan : Flag<["-"], "shared-libsan">, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, HelpText<"Dynamically link the sanitizer runtime">; def static_libsan : Flag<["-"], "static-libsan">, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, HelpText<"Statically link the sanitizer runtime (Not supported for ASan, TSan or UBSan on darwin)">; -def : Flag<["-"], "shared-libasan">, Alias; -def : Flag<["-"], "static-libasan">, Alias; +def : Flag<["-"], "shared-libasan">, Alias, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; +def : Flag<["-"], "static-libasan">, Alias, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def fasm : Flag<["-"], "fasm">, Group; defm assume_unique_vtables : BoolFOption<"assume-unique-vtables", @@ -2374,7 +2379,7 @@ def fmemory_profile_use_EQ : Joined<["-"], "fmemory-profile-use=">, // Begin sanitizer flags. These should all be core options exposed in all driver // modes. 
-let Visibility = [ClangOption, CC1Option, CLOption] in { +let Visibility = [ClangOption, CC1Option, CLOption, FlangOption, FC1Option] in { def fsanitize_EQ : CommaJoined<["-"], "fsanitize=">, Group, MetaVarName<"">, @@ -3096,6 +3101,8 @@ defm fat_lto_objects : BoolFOption<"fat-lto-objects", PosFlag, NegFlag, BothFlags<[], [ClangOption, CC1Option], " fat LTO object support">>; +def flto_partitions_EQ : Joined<["-"], "flto-partitions=">, Group, + HelpText<"Number of partitions to use for parallel full LTO codegen, ld.lld only.">; def fmacro_backtrace_limit_EQ : Joined<["-"], "fmacro-backtrace-limit=">, Group, Visibility<[ClangOption, CC1Option, CLOption]>, HelpText<"Set the maximum number of entries to print in a macro expansion backtrace (0 = no limit)">, @@ -4742,8 +4749,10 @@ def gmodules : Flag <["-"], "gmodules">, Group, " or precompiled headers">; def gno_modules : Flag <["-"], "gno-modules">, Group; def gz_EQ : Joined<["-"], "gz=">, Group, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, HelpText<"DWARF debug sections compression type">; -def gz : Flag<["-"], "gz">, Alias, AliasArgs<["zlib"]>, Group; +def gz : Flag<["-"], "gz">, Alias, AliasArgs<["zlib"]>, Group, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def gembed_source : Flag<["-"], "gembed-source">, Group, Visibility<[ClangOption, CC1Option]>, HelpText<"Embed source text in DWARF debug sections">, diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h index 36ddb9cd037ee..7e62593de5964 100644 --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -180,7 +180,6 @@ class ToolChain { Tool *getLinkerWrapper() const; mutable bool SanitizerArgsChecked = false; - mutable std::unique_ptr XRayArguments; /// The effective clang triple for the current Job. 
mutable llvm::Triple EffectiveTriple; @@ -322,7 +321,7 @@ class ToolChain { SanitizerArgs getSanitizerArgs(const llvm::opt::ArgList &JobArgs) const; - const XRayArgs& getXRayArgs() const; + const XRayArgs getXRayArgs(const llvm::opt::ArgList &) const; // Returns the Arg * that explicitly turned on/off rtti, or nullptr. const llvm::opt::Arg *getRTTIArg() const { return CachedRTTIArg; } diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h index 958d65055fa9b..6bf9ae8d074fb 100644 --- a/clang/include/clang/Sema/ScopeInfo.h +++ b/clang/include/clang/Sema/ScopeInfo.h @@ -949,6 +949,9 @@ class LambdaScopeInfo final : SourceLocation PotentialThisCaptureLocation; + /// Variables that are potentially ODR-used in CUDA/HIP. + llvm::SmallPtrSet CUDAPotentialODRUsedVars; + LambdaScopeInfo(DiagnosticsEngine &Diag) : CapturingScopeInfo(Diag, ImpCap_None) { Kind = SK_Lambda; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index eb82d1b978e94..6ae5a1e8065da 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11280,14 +11280,16 @@ class Sema final : public SemaBase { /// The context in which we are checking a template parameter list. enum TemplateParamListContext { - TPC_ClassTemplate, - TPC_VarTemplate, + // For this context, Class, Variable, TypeAlias, and non-pack Template + // Template Parameters are treated uniformly. 
+ TPC_Other, + TPC_FunctionTemplate, TPC_ClassTemplateMember, TPC_FriendClassTemplate, TPC_FriendFunctionTemplate, TPC_FriendFunctionTemplateDefinition, - TPC_TypeAliasTemplate + TPC_TemplateTemplateParameterPack, }; /// Checks the validity of a template parameter list, possibly diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h index d62c9bb65fadb..f72e1c53d2c92 100644 --- a/clang/include/clang/Sema/SemaAMDGPU.h +++ b/clang/include/clang/Sema/SemaAMDGPU.h @@ -15,12 +15,16 @@ #include "clang/AST/ASTFwd.h" #include "clang/Sema/SemaBase.h" +#include "llvm/ADT/SmallPtrSet.h" namespace clang { class AttributeCommonInfo; +class Expr; class ParsedAttr; class SemaAMDGPU : public SemaBase { + llvm::SmallPtrSet ExpandedPredicates; + public: SemaAMDGPU(Sema &S); @@ -64,6 +68,11 @@ class SemaAMDGPU : public SemaBase { void handleAMDGPUNumVGPRAttr(Decl *D, const ParsedAttr &AL); void handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, const ParsedAttr &AL); void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL); + + /// Expand a valid use of the feature identification builtins into its + /// corresponding sequence of instructions. + Expr *ExpandAMDGPUPredicateBI(CallExpr *CE); + bool IsPredicate(Expr *E) const; }; } // namespace clang diff --git a/clang/include/clang/Sema/SemaCUDA.h b/clang/include/clang/Sema/SemaCUDA.h index 71f05e88fb539..dbc1432860d89 100644 --- a/clang/include/clang/Sema/SemaCUDA.h +++ b/clang/include/clang/Sema/SemaCUDA.h @@ -274,6 +274,10 @@ class SemaCUDA : public SemaBase { /// parameters specified via <<<>>>. std::string getConfigureFuncName() const; + /// Record variables that are potentially ODR-used in CUDA/HIP. 
+ void recordPotentialODRUsedVariable(MultiExprArg Args, + OverloadCandidateSet &CandidateSet); + private: unsigned ForceHostDeviceDepth = 0; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index cd1bcb3b9a063..e11f7a43cebe9 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -1467,7 +1467,12 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target, } if (Target.getTriple().isAMDGPU() || - (AuxTarget && AuxTarget->getTriple().isAMDGPU())) { + (Target.getTriple().isSPIRV() && + Target.getTriple().getVendor() == llvm::Triple::AMD) || + (AuxTarget && + (AuxTarget->getTriple().isAMDGPU() || + ((AuxTarget->getTriple().isSPIRV() && + AuxTarget->getTriple().getVendor() == llvm::Triple::AMD))))) { #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ InitBuiltinType(SingletonId, BuiltinType::Id); #include "clang/Basic/AMDGPUTypes.def" @@ -12313,6 +12318,10 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, Type = Context.AMDGPUBufferRsrcTy; break; } + case 'c': { + Type = Context.AMDGPUFeaturePredicateTy; + break; + } default: llvm_unreachable("Unexpected target builtin type"); } diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index 9ea366af56a52..daec1229e2eaf 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -253,7 +253,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple, MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64; CUMode = !(GPUFeatures & llvm::AMDGPU::FEATURE_WGP); - for (auto F : {"image-insts", "gws"}) + for (auto F : {"image-insts", "gws", "vmem-to-lds-load-insts"}) ReadOnlyFeatures.insert(F); HalfArgsAndReturns = true; } diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp index f242fedc1ad66..27123c7bcc6dd 100644 --- a/clang/lib/Basic/Targets/SPIR.cpp +++ b/clang/lib/Basic/Targets/SPIR.cpp @@ -142,3 +142,12 @@ void 
SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) { Float128Format = DoubleFormat; } } + +bool SPIRV64AMDGCNTargetInfo::isValidCPUName(StringRef CPU) const { + return AMDGPUTI.isValidCPUName(CPU); +} + +void SPIRV64AMDGCNTargetInfo::fillValidCPUList( + SmallVectorImpl &Values) const { + return AMDGPUTI.fillValidCPUList(Values); +} diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index c0849b69dcdb3..468cf7490c18b 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -422,6 +422,11 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final } bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); } + + // This is only needed for validating arguments passed to + // __builtin_amdgcn_processor_is + bool isValidCPUName(StringRef Name) const override; + void fillValidCPUList(SmallVectorImpl &Values) const override; }; } // namespace targets diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 3e65eeb3755d2..b30ee1cea6245 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1099,6 +1099,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (CodeGenOpts.LinkBitcodePostopt) MPM.addPass(LinkInModulesPass(BC)); + if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice && + LangOpts.HIPStdParInterposeAlloc) + MPM.addPass(HipStdParAllocationInterpositionPass()); + // Add a verifier pass if requested. We don't have to do this if the action // requires code generation because there will already be a verifier pass in // the code-generation pipeline. @@ -1162,10 +1166,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline( return; } - if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice && - LangOpts.HIPStdParInterposeAlloc) - MPM.addPass(HipStdParAllocationInterpositionPass()); - // Now that we have all of the passes ready, run them. 
{ PrettyStackTraceString CrashInfo("Optimizer"); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 32119abd97f25..ec6fecead9138 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -76,6 +76,33 @@ using namespace clang; using namespace CodeGen; using namespace llvm; +/// Some builtins do not have library implementation on some targets and +/// are instead emitted as LLVM IRs by some target builtin emitters. +/// FIXME: Remove this when library support is added +static bool shouldEmitBuiltinAsIR(unsigned BuiltinID, + const Builtin::Context &BI, + const CodeGenFunction &CGF) { + if (!CGF.CGM.getLangOpts().MathErrno && + CGF.CurFPFeatures.getExceptionMode() == + LangOptions::FPExceptionModeKind::FPE_Ignore && + !CGF.CGM.getTargetCodeGenInfo().supportsLibCall()) { + switch (BuiltinID) { + default: + return false; + case Builtin::BIlogbf: + case Builtin::BI__builtin_logbf: + case Builtin::BIlogb: + case Builtin::BI__builtin_logb: + case Builtin::BIscalbnf: + case Builtin::BI__builtin_scalbnf: + case Builtin::BIscalbn: + case Builtin::BI__builtin_scalbn: + return true; + } + } + return false; +} + static void initializeAlloca(CodeGenFunction &CGF, AllocaInst *AI, Value *Size, Align AlignmentInBytes) { ConstantInt *Byte; @@ -359,9 +386,10 @@ static Value *EmitFromInt(CodeGenFunction &CGF, llvm::Value *V, static Address CheckAtomicAlignment(CodeGenFunction &CGF, const CallExpr *E) { ASTContext &Ctx = CGF.getContext(); Address Ptr = CGF.EmitPointerWithAlignment(E->getArg(0)); + const llvm::DataLayout &DL = CGF.CGM.getDataLayout(); unsigned Bytes = Ptr.getElementType()->isPointerTy() ? 
Ctx.getTypeSizeInChars(Ctx.VoidPtrTy).getQuantity() - : Ptr.getElementType()->getScalarSizeInBits() / 8; + : DL.getTypeStoreSize(Ptr.getElementType()); unsigned Align = Ptr.getAlignment().getQuantity(); if (Align % Bytes != 0) { DiagnosticsEngine &Diags = CGF.CGM.getDiags(); @@ -2881,7 +2909,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, // disabled. // Math intrinsics are generated only when math-errno is disabled. Any pragmas // or attributes that affect math-errno should prevent or allow math - // intrincs to be generated. Intrinsics are generated: + // intrinsics to be generated. Intrinsics are generated: // 1- In fast math mode, unless math-errno is overriden // via '#pragma float_control(precise, on)', or via an // 'attribute__((optnone))'. @@ -6428,13 +6456,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, // If this is an alias for a lib function (e.g. __builtin_sin), emit // the call using the normal call path, but using the unmangled // version of the function name. - if (getContext().BuiltinInfo.isLibFunction(BuiltinID)) + const auto &BI = getContext().BuiltinInfo; + if (!shouldEmitBuiltinAsIR(BuiltinID, BI, *this) && + BI.isLibFunction(BuiltinID)) return emitLibraryCall(*this, FD, E, CGM.getBuiltinLibFunction(FD, BuiltinID)); // If this is a predefined lib function (e.g. malloc), emit the call // using exactly the normal call path. 
- if (getContext().BuiltinInfo.isPredefinedLibFunction(BuiltinID)) + if (BI.isPredefinedLibFunction(BuiltinID)) return emitLibraryCall(*this, FD, E, CGM.getRawFunctionPointer(FD)); // Check that a call to a target specific builtin has the correct target @@ -17343,7 +17373,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); } case X86::BI__builtin_ia32_cvtsbf162ss_32: - return Builder.CreateFPExt(Ops[0], Builder.getFloatTy()); + return Builder.CreateFPExt(Ops[0], Builder.getFloatTy()); case X86::BI__builtin_ia32_cvtneps2bf16_256_mask: case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: { @@ -19784,6 +19814,18 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs)); } +static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) { + auto PTy = IntegerType::getInt1Ty(CGF.getLLVMContext()); + + auto *P = cast( + CGF.CGM.getModule().getOrInsertGlobal(Name.str(), PTy)); + P->setConstant(true); + P->setExternallyInitialized(true); + + return CGF.Builder.CreateLoad( + RawAddress(P, PTy, CharUnits::One(), KnownNonNull)); +} + Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; @@ -20085,6 +20127,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Value *Env = EmitScalarExpr(E->getArg(0)); return Builder.CreateCall(F, {Env}); } + case AMDGPU::BI__builtin_amdgcn_processor_is: { + assert(CGM.getTriple().isSPIRV() && + "__builtin_amdgcn_processor_is should never reach CodeGen for " + "concrete targets!"); + StringRef Proc = cast(E->getArg(0))->getString(); + return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.is." 
+ Proc); + } + case AMDGPU::BI__builtin_amdgcn_is_invocable: { + assert(CGM.getTriple().isSPIRV() && + "__builtin_amdgcn_is_invocable should never reach CodeGen for " + "concrete targets!"); + auto *FD = cast( + cast(E->getArg(0))->getReferencedDeclOfCallee()); + StringRef RF = + getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); + return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF); + } case AMDGPU::BI__builtin_amdgcn_read_exec: return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false); case AMDGPU::BI__builtin_amdgcn_read_exec_lo: @@ -20116,19 +20175,81 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir, RayInverseDir, TextureDescr}); } + case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray: + case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: { + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray: + IID = Intrinsic::amdgcn_image_bvh8_intersect_ray; + break; + case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: + IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray; + break; + } + llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0)); + llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1)); + llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2)); + llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3)); + llvm::Value *RayDir = EmitScalarExpr(E->getArg(4)); + llvm::Value *Offset = EmitScalarExpr(E->getArg(5)); + llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6)); + + Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7)); + Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8)); + + llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID); + + llvm::CallInst *CI = Builder.CreateCall( + IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir, + Offset, TextureDescr}); + + llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0); + 
llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1); + llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2); + + Builder.CreateStore(RetRayOrigin, RetRayOriginPtr); + Builder.CreateStore(RetRayDir, RetRayDirPtr); + + return RetVData; + } + + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn: + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn: + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: { + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: + IID = Intrinsic::amdgcn_ds_bvh_stack_rtn; + break; + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn: + IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn; + break; + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn: + IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn; + break; + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: + IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn; + break; + } - case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: { SmallVector Args; for (int i = 0, e = E->getNumArgs(); i != e; ++i) Args.push_back(EmitScalarExpr(E->getArg(i))); - Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ds_bvh_stack_rtn); + Function *F = CGM.getIntrinsic(IID); Value *Call = Builder.CreateCall(F, Args); Value *Rtn = Builder.CreateExtractValue(Call, 0); Value *A = Builder.CreateExtractValue(Call, 1); llvm::Type *RetTy = ConvertType(E->getType()); Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn, (uint64_t)0); + // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns + // <2 x i64>, zext the second value. 
+ if (A->getType()->getPrimitiveSizeInBits() < + RetTy->getScalarType()->getPrimitiveSizeInBits()) + A = Builder.CreateZExt(A, RetTy->getScalarType()); + return Builder.CreateInsertElement(I0, A, 1); } case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4: @@ -20632,6 +20753,57 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_s_prefetch_data: return emitBuiltinWithOneOverloadedType<2>( *this, E, Intrinsic::amdgcn_s_prefetch_data); + case Builtin::BIlogbf: + case Builtin::BI__builtin_logbf: { + Value *Src0 = EmitScalarExpr(E->getArg(0)); + Function *FrExpFunc = CGM.getIntrinsic( + Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()}); + CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0); + Value *Exp = Builder.CreateExtractValue(FrExp, 1); + Value *Add = Builder.CreateAdd( + Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true); + Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getFloatTy()); + Value *Fabs = + emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs); + Value *FCmpONE = Builder.CreateFCmpONE( + Fabs, ConstantFP::getInfinity(Builder.getFloatTy())); + Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs); + Value *FCmpOEQ = + Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getFloatTy())); + Value *Sel2 = Builder.CreateSelect( + FCmpOEQ, + ConstantFP::getInfinity(Builder.getFloatTy(), /*Negative=*/true), Sel1); + return Sel2; + } + case Builtin::BIlogb: + case Builtin::BI__builtin_logb: { + Value *Src0 = EmitScalarExpr(E->getArg(0)); + Function *FrExpFunc = CGM.getIntrinsic( + Intrinsic::frexp, {Src0->getType(), Builder.getInt32Ty()}); + CallInst *FrExp = Builder.CreateCall(FrExpFunc, Src0); + Value *Exp = Builder.CreateExtractValue(FrExp, 1); + Value *Add = Builder.CreateAdd( + Exp, ConstantInt::getSigned(Exp->getType(), -1), "", false, true); + Value *SIToFP = Builder.CreateSIToFP(Add, Builder.getDoubleTy()); + Value *Fabs = + 
emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::fabs); + Value *FCmpONE = Builder.CreateFCmpONE( + Fabs, ConstantFP::getInfinity(Builder.getDoubleTy())); + Value *Sel1 = Builder.CreateSelect(FCmpONE, SIToFP, Fabs); + Value *FCmpOEQ = + Builder.CreateFCmpOEQ(Src0, ConstantFP::getZero(Builder.getDoubleTy())); + Value *Sel2 = Builder.CreateSelect( + FCmpOEQ, + ConstantFP::getInfinity(Builder.getDoubleTy(), /*Negative=*/true), + Sel1); + return Sel2; + } + case Builtin::BIscalbnf: + case Builtin::BI__builtin_scalbnf: + case Builtin::BIscalbn: + case Builtin::BI__builtin_scalbn: + return emitBinaryExpMaybeConstrainedFPBuiltin( + *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp); default: return nullptr; } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 84fcad0aa72bb..210ce28c954b9 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -969,6 +969,13 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) { DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_unsigned); \ return SingletonId; \ } +#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ + case BuiltinType::Id: { \ + if (!SingletonId) \ + SingletonId = \ + DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_boolean); \ + return SingletonId; \ + } #include "clang/Basic/AMDGPUTypes.def" case BuiltinType::UChar: case BuiltinType::Char_U: diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index df850421c72c6..a349a1fe307a7 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -974,6 +974,10 @@ Value *ScalarExprEmitter::EmitConversionToBool(Value *Src, QualType SrcType) { if (const MemberPointerType *MPT = dyn_cast(SrcType)) return CGF.CGM.getCXXABI().EmitMemberPointerIsNotNull(CGF, Src, MPT); + // The conversion is a NOP, and will be done when CodeGening the builtin. 
+ if (SrcType == CGF.getContext().AMDGPUFeaturePredicateTy) + return Src; + assert((SrcType->isIntegerType() || isa(Src->getType())) && "Unknown scalar type to convert"); diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 68c9e66517653..1dfcf640311fd 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -9720,6 +9720,17 @@ static void emitTargetCallKernelLaunch( ++RedVarCount; } } + // Process debug info. + if (CGF.CGM.getCodeGenOpts().getDebugInfo() != + llvm::codegenoptions::NoDebugInfo) { + auto FillInfoMap = [&](MappableExprsHandler::MappingExprInfo &MapExpr) { + return emitMappingInformation(CGF, OMPBuilder, MapExpr); + }; + + CombinedInfo.Names.resize(CombinedInfo.Exprs.size()); + llvm::transform(CombinedInfo.Exprs, CombinedInfo.Names.begin(), + FillInfoMap); + } } CGOpenMPRuntime::TargetDataInfo Info; diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 1a097608e81ed..237e76ed14fbb 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -981,10 +981,12 @@ bool CodeGenFunction::EmitXteamRedStmt(const Stmt *S) { } assert(RedRHSExpr != nullptr && "Did not find a valid reduction rhs"); llvm::Value *RHSValue = EmitScalarExpr(RedRHSExpr); - Address XteamRedLocalAddr = RedVarMap.find(RedVarDecl)->second.RedVarAddr; + auto It = RedVarMap.find(RedVarDecl); + assert(It != RedVarMap.end() && "Variable must be found in reduction map"); + Address XteamRedLocalAddr = It->second.RedVarAddr; // Compute *xteam_red_local_addr + rhs_value llvm::Value *RedRHS = nullptr; - llvm::Type *RedVarType = ConvertTypeForMem(RedVarDecl->getType()); + llvm::Type *RedVarType = ConvertTypeForMem(It->second.RedVarExpr->getType()); if (RedVarType->isFloatTy() || RedVarType->isDoubleTy() || RedVarType->isHalfTy() || RedVarType->isBFloatTy()) { auto RHSOp = RHSValue->getType()->isIntegerTy() diff --git a/clang/lib/CodeGen/CGVTables.cpp 
b/clang/lib/CodeGen/CGVTables.cpp index 7faf6821a6cdc..c9108938bca50 100644 --- a/clang/lib/CodeGen/CGVTables.cpp +++ b/clang/lib/CodeGen/CGVTables.cpp @@ -771,6 +771,10 @@ void CodeGenVTables::addVTableComponent(ConstantArrayBuilder &builder, case VTableComponent::CK_DeletingDtorPointer: { GlobalDecl GD = component.getGlobalDecl(); + const bool IsThunk = + nextVTableThunkIndex < layout.vtable_thunks().size() && + layout.vtable_thunks()[nextVTableThunkIndex].first == componentIndex; + if (CGM.getLangOpts().CUDA) { // Emit NULL for methods we can't codegen on this // side. Otherwise we'd end up with vtable with unresolved @@ -782,9 +786,12 @@ void CodeGenVTables::addVTableComponent(ConstantArrayBuilder &builder, CGM.getLangOpts().CUDAIsDevice ? MD->hasAttr() : (MD->hasAttr() || !MD->hasAttr()); - if (!CanEmitMethod) + if (!CanEmitMethod) { + if (IsThunk) + nextVTableThunkIndex++; return builder.add( llvm::ConstantExpr::getNullValue(CGM.GlobalsInt8PtrTy)); + } // Method is acceptable, continue processing as usual. } @@ -830,9 +837,7 @@ void CodeGenVTables::addVTableComponent(ConstantArrayBuilder &builder, fnPtr = DeletedVirtualFn; // Thunks. - } else if (nextVTableThunkIndex < layout.vtable_thunks().size() && - layout.vtable_thunks()[nextVTableThunkIndex].first == - componentIndex) { + } else if (IsThunk) { auto &thunkInfo = layout.vtable_thunks()[nextVTableThunkIndex].second; nextVTableThunkIndex++; diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 2959da0b5c589..09015282d5575 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -3072,13 +3072,15 @@ static void emitUsed(CodeGenModule &CGM, StringRef Name, for (unsigned i = 0, e = List.size(); i != e; ++i) { UsedArray[i] = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( cast(&*List[i]), - llvm::PointerType::getUnqual(CGM.getLLVMContext())); + CGM.getTarget().getTriple().isAMDGCN() ? 
+ llvm::PointerType::getUnqual(CGM.getLLVMContext()) : + CGM.Int8PtrTy); } if (UsedArray.empty()) return; - llvm::ArrayType *ATy = llvm::ArrayType::get( - llvm::PointerType::getUnqual(CGM.getLLVMContext()), UsedArray.size()); + llvm::ArrayType *ATy = llvm::ArrayType::get(UsedArray.front()->getType(), + UsedArray.size()); auto *GV = new llvm::GlobalVariable( CGM.getModule(), ATy, false, llvm::GlobalValue::AppendingLinkage, diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 405242e97e75c..3047a52a30f49 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -574,6 +574,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { case BuiltinType::Id: \ return llvm::TargetExtType::get(getLLVMContext(), "amdgcn.named.barrier", \ {}, {Scope}); +#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ + case BuiltinType::Id: \ + return ConvertType(getContext().getLogicalOperationType()); #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h index 4a66683a3b91f..21f7cde5a960c 100644 --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -71,6 +71,10 @@ class TargetCodeGenInfo { return *SwiftInfo; } + /// supportsLibCall - Query to whether or not target supports all + /// lib calls. + virtual bool supportsLibCall() const { return true; } + /// setTargetAttributes - Provides a convenient hook to handle extra /// target-specific attributes for the given global. 
virtual void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV, diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 2c6f745a84abe..dcebf51ce6132 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -301,6 +301,7 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo { AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT) : TargetCodeGenInfo(std::make_unique(CGT)) {} + bool supportsLibCall() const override { return false; } void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F, CodeGenModule &CGM) const; diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 3a1d3deb4954f..38cad6017562a 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -3726,9 +3726,12 @@ class OffloadingActionBuilder final { // compiler phases, including backend and assemble phases. ActionList AL; Action *BackendAction = nullptr; - if (ToolChains.front()->getTriple().isSPIRV()) { + if (ToolChains.front()->getTriple().isSPIRV() || + (ToolChains.front()->getTriple().isAMDGCN() && + GpuArchList[I] == StringRef("amdgcnspirv"))) { // Emit LLVM bitcode for SPIR-V targets. SPIR-V device tool chain - // (HIPSPVToolChain) runs post-link LLVM IR passes. + // (HIPSPVToolChain or HIPAMDToolChain) runs post-link LLVM IR + // passes. types::ID Output = Args.hasArg(options::OPT_S) ? types::TY_LLVM_IR : types::TY_LLVM_BC; @@ -7136,9 +7139,10 @@ static const char *GetStableCStr(llvm::StringSet<> &SavedStrings, StringRef S) { /// /// '#': Silence information about the changes to the command line arguments. /// -/// '^': Add FOO as a new argument at the beginning of the command line. +/// '^FOO': Add FOO as a new argument at the beginning of the command line +/// right after the name of the compiler executable. /// -/// '+': Add FOO as a new argument at the end of the command line. +/// '+FOO': Add FOO as a new argument at the end of the command line. 
/// /// 's/XXX/YYY/': Substitute the regular expression XXX with YYY in the command /// line. @@ -7226,7 +7230,7 @@ static void applyOneOverrideOption(raw_ostream &OS, void driver::applyOverrideOptions(SmallVectorImpl &Args, const char *OverrideStr, llvm::StringSet<> &SavedStrings, - raw_ostream *OS) { + StringRef EnvVar, raw_ostream *OS) { if (!OS) OS = &llvm::nulls(); @@ -7235,7 +7239,7 @@ void driver::applyOverrideOptions(SmallVectorImpl &Args, OS = &llvm::nulls(); } - *OS << "### CCC_OVERRIDE_OPTIONS: " << OverrideStr << "\n"; + *OS << "### " << EnvVar << ": " << OverrideStr << "\n"; // This does not need to be efficient. diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp index 12d763e5c65b6..6dfb72fb223ce 100644 --- a/clang/lib/Driver/OffloadBundler.cpp +++ b/clang/lib/Driver/OffloadBundler.cpp @@ -83,32 +83,27 @@ OffloadTargetInfo::OffloadTargetInfo(const StringRef Target, const OffloadBundlerConfig &BC) : BundlerConfig(BC) { - // TODO: Add error checking from ClangOffloadBundler.cpp - auto TargetFeatures = Target.split(':'); - auto TripleOrGPU = TargetFeatures.first.rsplit('-'); - - if (clang::StringToOffloadArch(TripleOrGPU.second) != - clang::OffloadArch::UNKNOWN) { - auto KindTriple = TripleOrGPU.first.split('-'); - this->OffloadKind = KindTriple.first; - - // Enforce optional env field to standardize bundles - llvm::Triple t = llvm::Triple(KindTriple.second); - this->Triple = llvm::Triple(t.getArchName(), t.getVendorName(), - t.getOSName(), t.getEnvironmentName()); - - this->TargetID = Target.substr(Target.find(TripleOrGPU.second)); - } else { - auto KindTriple = TargetFeatures.first.split('-'); - this->OffloadKind = KindTriple.first; - - // Enforce optional env field to standardize bundles - llvm::Triple t = llvm::Triple(KindTriple.second); - this->Triple = llvm::Triple(t.getArchName(), t.getVendorName(), - t.getOSName(), t.getEnvironmentName()); - + // -[-[:target features]] + // := --- + SmallVector Components; + 
Target.split(Components, '-', /*MaxSplit=*/5); + assert((Components.size() == 5 || Components.size() == 6) && + "malformed target string"); + + StringRef TargetIdWithFeature = + Components.size() == 6 ? Components.back() : ""; + StringRef TargetId = TargetIdWithFeature.split(':').first; + if (!TargetId.empty() && + clang::StringToOffloadArch(TargetId) != clang::OffloadArch::UNKNOWN) + this->TargetID = TargetIdWithFeature; + else this->TargetID = ""; - } + + this->OffloadKind = Components.front(); + ArrayRef TripleSlice{&Components[1], /*length=*/4}; + llvm::Triple T = llvm::Triple(llvm::join(TripleSlice, "-")); + this->Triple = llvm::Triple(T.getArchName(), T.getVendorName(), T.getOSName(), + T.getEnvironmentName()); } bool OffloadTargetInfo::hasHostKind() const { @@ -148,7 +143,18 @@ bool OffloadTargetInfo::operator==(const OffloadTargetInfo &Target) const { } std::string OffloadTargetInfo::str() const { - return Twine(OffloadKind + "-" + Triple.str() + "-" + TargetID).str(); + std::string NormalizedTriple; + // Unfortunately we need some special sauce for AMDGPU because all the runtime + // assumes the triple to be "amdgcn-amd-amdhsa-" (empty environment) instead + // of "amdgcn-amd-amdhsa-unknown". It's gonna be very tricky to patch + // different layers of runtime. 
+ if (Triple.isAMDGPU()) { + NormalizedTriple = Triple.normalize(Triple::CanonicalForm::THREE_IDENT); + NormalizedTriple.push_back('-'); + } else { + NormalizedTriple = Triple.normalize(Triple::CanonicalForm::FOUR_IDENT); + } + return Twine(OffloadKind + "-" + NormalizedTriple + "-" + TargetID).str(); } static StringRef getDeviceFileExtension(StringRef Device, @@ -1507,6 +1513,9 @@ Error OffloadBundler::UnbundleFiles() { StringMap Worklist; auto Output = BundlerConfig.OutputFileNames.begin(); for (auto &Triple : BundlerConfig.TargetNames) { + if (!checkOffloadBundleID(Triple)) + return createStringError(errc::invalid_argument, + "invalid bundle id from bundle config"); Worklist[Triple] = *Output; ++Output; } @@ -1526,6 +1535,9 @@ Error OffloadBundler::UnbundleFiles() { StringRef CurTriple = **CurTripleOrErr; assert(!CurTriple.empty()); + if (!checkOffloadBundleID(CurTriple)) + return createStringError(errc::invalid_argument, + "invalid bundle id read from the bundle"); auto Output = Worklist.begin(); for (auto E = Worklist.end(); Output != E; Output++) { @@ -1584,6 +1596,8 @@ Error OffloadBundler::UnbundleFiles() { return createFileError(E.second, EC); // If this entry has a host kind, copy the input file to the output file. + // We don't need to check E.getKey() here through checkOffloadBundleID + // because the entire WorkList has been checked above. auto OffloadInfo = OffloadTargetInfo(E.getKey(), BundlerConfig); if (OffloadInfo.hasHostKind()) OutputFile.write(Input.getBufferStart(), Input.getBufferSize()); @@ -1813,6 +1827,10 @@ Error OffloadBundler::UnbundleArchive() { // archive. 
while (!CodeObject.empty()) { SmallVector CompatibleTargets; + if (!checkOffloadBundleID(CodeObject)) { + return createStringError(errc::invalid_argument, + "Invalid bundle id read from code object"); + } auto CodeObjectInfo = OffloadTargetInfo(CodeObject, BundlerConfig); if (getCompatibleOffloadTargets(CodeObjectInfo, CompatibleTargets, BundlerConfig)) { @@ -1894,3 +1912,11 @@ Error OffloadBundler::UnbundleArchive() { return Error::success(); } + +bool clang::checkOffloadBundleID(const llvm::StringRef Str) { + // -[-[:target features]] + // := --- + SmallVector Components; + Str.split(Components, '-', /*MaxSplit=*/5); + return Components.size() == 5 || Components.size() == 6; +} diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 7d4818041167d..e463ec097bd55 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -393,10 +393,9 @@ ToolChain::getSanitizerArgs(const llvm::opt::ArgList &JobArgs) const { return SanArgs; } -const XRayArgs& ToolChain::getXRayArgs() const { - if (!XRayArguments) - XRayArguments.reset(new XRayArgs(*this, Args)); - return *XRayArguments; +const XRayArgs ToolChain::getXRayArgs(const llvm::opt::ArgList &JobArgs) const { + XRayArgs XRayArguments(*this, JobArgs); + return XRayArguments; } namespace { @@ -451,10 +450,18 @@ static std::string normalizeProgramName(llvm::StringRef Argv0) { return ProgName; } -static const DriverSuffix *parseDriverSuffix(StringRef ProgName, size_t &Pos) { +static const DriverSuffix *parseDriverSuffix(StringRef ProgName, size_t &Pos, bool &FlangNew) { // Try to infer frontend type and default target from the program name by // comparing it against DriverSuffixes in order. + // Part I: Warn if invocation happens with flang-new (for Flang); this is for + // the time being and should be removed once AMD Classic Flang has been + // removed from ROCm. 
+ FlangNew = false; + if (ProgName.ends_with("flang-new")) { + FlangNew = true; + } + // If there is a match, the function tries to identify a target as prefix. // E.g. "x86_64-linux-clang" as interpreted as suffix "clang" with target // prefix "x86_64-linux". If such a target prefix is found, it may be @@ -488,7 +495,23 @@ ParsedClangName ToolChain::getTargetAndModeFromProgramName(StringRef PN) { std::string ProgName = normalizeProgramName(PN); size_t SuffixPos; - const DriverSuffix *DS = parseDriverSuffix(ProgName, SuffixPos); + bool FlangNew = false; + const DriverSuffix *DS = parseDriverSuffix(ProgName, SuffixPos, FlangNew); + + // Part II: Warn if invocation happens with flang-new (for Flang); this is for + // the time being and should be removed once AMD Classic Flang has been + // removed from ROCm. + if (FlangNew) { + // flang-new warning is overwarning, disabling until fixed. + if (false && !::getenv("AMD_NOWARN_FLANG_NEW")) { + // The solution with "llvm::errs()" is not ideal, but the driver object + // is not been constructed yet, so we cannot use the Diag() infrastructure + // for this. 
+ llvm::errs() << "warning: the 'amdflang-new' and 'flang-new' commmands " + "have been deprecated; please use 'amdflang' instead\n"; + } + } + if (!DS) return {}; size_t SuffixEnd = SuffixPos + strlen(DS->Suffix); diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 34eda9855783d..798ea8aad6de6 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -626,39 +626,17 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-shared"); } + if (C.getDriver().isUsingLTO()) { + const bool ThinLTO = (C.getDriver().getLTOMode() == LTOK_Thin); + addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], ThinLTO); + } else if (Args.hasArg(options::OPT_mcpu_EQ)) + CmdArgs.push_back(Args.MakeArgString( + "-plugin-opt=mcpu=" + Args.getLastArgValue(options::OPT_mcpu_EQ))); + addLinkerCompressDebugSectionsOption(getToolChain(), Args, CmdArgs); Args.AddAllArgs(CmdArgs, options::OPT_L); getToolChain().AddFilePathLibArgs(Args, CmdArgs); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA); - if (C.getDriver().isUsingLTO()) { - addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], - C.getDriver().getLTOMode() == LTOK_Thin); - } else if (Args.hasArg(options::OPT_mcpu_EQ)) { - CmdArgs.push_back(Args.MakeArgString( - "-plugin-opt=mcpu=" + - getProcessorFromTargetID(getToolChain().getTriple(), - Args.getLastArgValue(options::OPT_mcpu_EQ)))); - } - - // Always pass the target-id features to the LTO job. 
- std::vector Features; - getAMDGPUTargetFeatures(C.getDriver(), getToolChain().getTriple(), Args, - Features); - if (!Features.empty()) { - CmdArgs.push_back( - Args.MakeArgString("-plugin-opt=-mattr=" + llvm::join(Features, ","))); - } - - if (Args.hasArg(options::OPT_stdlib)) - CmdArgs.append({"-lc", "-lm"}); - if (Args.hasArg(options::OPT_startfiles)) { - std::optional IncludePath = getToolChain().getStdlibPath(); - if (!IncludePath) - IncludePath = "/lib"; - SmallString<128> P(*IncludePath); - llvm::sys::path::append(P, "crt1.o"); - CmdArgs.push_back(Args.MakeArgString(P)); - } CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); @@ -667,6 +645,26 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs, Inputs, Output)); } +static unsigned getFullLTOPartitions(const Driver &D, const ArgList &Args) { + int Value = 0; + StringRef A = Args.getLastArgValue(options::OPT_flto_partitions_EQ, "8"); + if (A.getAsInteger(10, Value) || (Value < 1)) { + Arg *Arg = Args.getLastArg(options::OPT_flto_partitions_EQ); + D.Diag(diag::err_drv_invalid_int_value) + << Arg->getAsString(Args) << Arg->getValue(); + return 1; + } + + return Value; +} + +void amdgpu::addFullLTOPartitionOption(const Driver &D, + const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) { + CmdArgs.push_back(Args.MakeArgString("--lto-partitions=" + + Twine(getFullLTOPartitions(D, Args)))); +} + void amdgpu::getAMDGPUTargetFeatures(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args, diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h index 09fbbb327e74b..ed0615dacaaf4 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.h +++ b/clang/lib/Driver/ToolChains/AMDGPU.h @@ -82,6 +82,8 @@ const char *getLldCommandArgs( const std::optional OutputFilePrefix = std::nullopt); } // end namespace dlr +void addFullLTOPartitionOption(const Driver &D, const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList 
&CmdArgs); } // end namespace amdgpu } // end namespace tools diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 5dbc0bd0472c0..692702963c226 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1041,21 +1041,29 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, ArgM = ArgMD; if (ArgM) { - // Determine the output location. - const char *DepFile; - if (Arg *MF = Args.getLastArg(options::OPT_MF)) { - DepFile = MF->getValue(); - C.addFailureResultFile(DepFile, &JA); - } else if (Output.getType() == types::TY_Dependencies) { - DepFile = Output.getFilename(); - } else if (!ArgMD) { - DepFile = "-"; - } else { - DepFile = getDependencyFileName(Args, Inputs); - C.addFailureResultFile(DepFile, &JA); + if (!JA.isDeviceOffloading(Action::OFK_HIP)) { + // Determine the output location. + const char *DepFile; + if (Arg *MF = Args.getLastArg(options::OPT_MF)) { + DepFile = MF->getValue(); + C.addFailureResultFile(DepFile, &JA); + } else if (Output.getType() == types::TY_Dependencies) { + DepFile = Output.getFilename(); + } else if (!ArgMD) { + DepFile = "-"; + } else { + DepFile = getDependencyFileName(Args, Inputs); + C.addFailureResultFile(DepFile, &JA); + } + CmdArgs.push_back("-dependency-file"); + CmdArgs.push_back(DepFile); + } + // Cmake generates dependency files using all compilation options specified + // by users. Claim those not used for dependency files. 
+ if (JA.isOffloading(Action::OFK_HIP)) { + Args.ClaimAllArgs(options::OPT_offload_compress); + Args.ClaimAllArgs(options::OPT_no_offload_compress); } - CmdArgs.push_back("-dependency-file"); - CmdArgs.push_back(DepFile); bool HasTarget = false; for (const Arg *A : Args.filtered(options::OPT_MT, options::OPT_MQ)) { @@ -4980,10 +4988,16 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T, renderDwarfFormat(D, T, Args, CmdArgs, EffectiveDWARFVersion); RenderDebugInfoCompressionArgs(Args, CmdArgs, D, TC); - bool EmitDwarfForAMDGCN = EmitDwarf && T.isAMDGCN(); + bool EmitDwarfForAMDGCN = + EmitDwarf && + (T.isAMDGCN() || (T.isSPIRV() && T.getVendor() == llvm::Triple::AMD)); if (EmitDwarfForAMDGCN) CmdArgs.append({"-mllvm", "-amdgpu-spill-cfi-saved-regs"}); if (Arg *A = Args.getLastArg(options::OPT_gheterogeneous_dwarf_EQ)) { + if (StringRef(A->getValue()) == "diexpr" && T.isSPIRV() && + T.getVendor() == llvm::Triple::AMD) + D.Diag(clang::diag::err_drv_unsupported_opt_with_suggestion) + << A->getAsString(Args) << "-gheterogeneous-dwarf=diexpression"; A->render(Args, CmdArgs); } else if (EmitDwarfForAMDGCN) { #ifndef NDEBUG @@ -7075,7 +7089,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("--offload-new-driver"); } - const XRayArgs &XRay = TC.getXRayArgs(); + const XRayArgs &XRay = TC.getXRayArgs(Args); XRay.addArgs(TC, Args, CmdArgs, InputType); for (const auto &Filename : @@ -9203,7 +9217,8 @@ void OffloadBundler::ConstructJob(Compilation &C, const JobAction &JA, } Triples += Action::GetOffloadKindName(CurKind); Triples += '-'; - Triples += CurTC->getTriple().normalize(); + Triples += + CurTC->getTriple().normalize(llvm::Triple::CanonicalForm::FOUR_IDENT); if ((CurKind == Action::OFK_HIP || CurKind == Action::OFK_Cuda) && !StringRef(CurDep->getOffloadingArch()).empty()) { Triples += '-'; @@ -9306,7 +9321,8 @@ void OffloadBundler::ConstructJobMultipleOutputs( auto OffloadKind = Dep.DependentOffloadKind; 
Triples += Action::GetOffloadKindName(OffloadKind); Triples += '-'; - Triples += Dep.DependentToolChain->getTriple().normalize(); + Triples += Dep.DependentToolChain->getTriple().normalize( + llvm::Triple::CanonicalForm::FOUR_IDENT); if ((Dep.DependentOffloadKind == Action::OFK_HIP || Dep.DependentOffloadKind == Action::OFK_Cuda) && !Dep.DependentBoundArch.empty()) { diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 1f52dd9228e95..ce73067ce9f61 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -907,6 +907,17 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, // files if (IsFatLTO) CmdArgs.push_back("--fat-lto-objects"); + + if (Args.hasArg(options::OPT_flto_partitions_EQ)) { + int Value = 0; + StringRef A = Args.getLastArgValue(options::OPT_flto_partitions_EQ, "8"); + if (A.getAsInteger(10, Value) || (Value < 1)) { + Arg *Arg = Args.getLastArg(options::OPT_flto_partitions_EQ); + D.Diag(diag::err_drv_invalid_int_value) + << Arg->getAsString(Args) << Arg->getValue(); + } + CmdArgs.push_back(Args.MakeArgString("--lto-partitions=" + A)); + } } const char *PluginOptPrefix = IsOSAIX ? 
"-bplugin_opt:" : "-plugin-opt="; @@ -1819,17 +1830,18 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, } bool tools::addXRayRuntime(const ToolChain&TC, const ArgList &Args, ArgStringList &CmdArgs) { + const XRayArgs &XRay = TC.getXRayArgs(Args); if (Args.hasArg(options::OPT_shared)) { - if (TC.getXRayArgs().needsXRayDSORt()) { + if (XRay.needsXRayDSORt()) { CmdArgs.push_back("--whole-archive"); CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray-dso")); CmdArgs.push_back("--no-whole-archive"); return true; } - } else if (TC.getXRayArgs().needsXRayRt()) { + } else if (XRay.needsXRayRt()) { CmdArgs.push_back("--whole-archive"); CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray")); - for (const auto &Mode : TC.getXRayArgs().modeList()) + for (const auto &Mode : XRay.modeList()) CmdArgs.push_back(TC.getCompilerRTArgString(Args, Mode)); CmdArgs.push_back("--no-whole-archive"); return true; @@ -2742,7 +2754,8 @@ static void GetSDLFromOffloadArchive( SmallString<128> DeviceTriple; DeviceTriple += Action::GetOffloadKindName(JA.getOffloadingDeviceKind()); DeviceTriple += '-'; - std::string NormalizedTriple = T.getToolChain().getTriple().normalize(); + std::string NormalizedTriple = T.getToolChain().getTriple().normalize( + llvm::Triple::CanonicalForm::FOUR_IDENT); DeviceTriple += NormalizedTriple; if (!Target.empty()) { DeviceTriple += '-'; diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index 55c55bad73934..bf0d02834d7d5 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1627,7 +1627,7 @@ void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args, } } - const XRayArgs &XRay = getXRayArgs(); + const XRayArgs &XRay = getXRayArgs(Args); if (XRay.needsXRayRt()) { AddLinkRuntimeLib(Args, CmdArgs, "xray"); AddLinkRuntimeLib(Args, CmdArgs, "xray-basic"); diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp 
index 34e91c0b20a7e..c94a63e105a3e 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -254,10 +254,11 @@ void HIPAMDToolChain::addClangTargetOptions( CC1Args.append({"-fcuda-is-device", "-fno-threadsafe-statics"}); if (!DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, - false)) + false)) { CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"}); - if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar)) - CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"}); + if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar)) + CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"}); + } StringRef MaxThreadsPerBlock = DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ); @@ -315,6 +316,10 @@ HIPAMDToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, checkTargetID(*DAL); } + if (!Args.hasArg(options::OPT_flto_partitions_EQ)) + DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_flto_partitions_EQ), + "8"); + return DAL; } diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp index dfe9acc1ec795..b4469c05cea3f 100644 --- a/clang/lib/Driver/ToolChains/HIPUtility.cpp +++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp @@ -45,7 +45,7 @@ static std::string normalizeForBundler(const llvm::Triple &T, return HasTargetID ? 
(T.getArchName() + "-" + T.getVendorName() + "-" + T.getOSName() + "-" + T.getEnvironmentName()) .str() - : T.normalize(); + : T.normalize(llvm::Triple::CanonicalForm::FOUR_IDENT); } // Collect undefined __hip_fatbin* and __hip_gpubin_handle* symbols from all diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index 07468a2cd0a07..ea628647d6acd 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -985,9 +985,11 @@ void Linux::AddHIPRuntimeLibArgs(const ArgList &Args, Args.MakeArgString(StringRef("-L") + RocmInstallation->getLibPath())); if (Args.hasFlag(options::OPT_frtlib_add_rpath, - options::OPT_fno_rtlib_add_rpath, false)) - CmdArgs.append( - {"-rpath", Args.MakeArgString(RocmInstallation->getLibPath())}); + options::OPT_fno_rtlib_add_rpath, false)) { + SmallString<0> p = RocmInstallation->getLibPath(); + llvm::sys::path::remove_dots(p, true); + CmdArgs.append({"-rpath", Args.MakeArgString(p)}); + } CmdArgs.push_back("-lamdhip64"); } diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 17f624e964539..32692760fe799 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -614,8 +614,10 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, Builder.defineMacro("__HIP_MEMORY_SCOPE_SYSTEM", "5"); if (LangOpts.HIPStdPar) { Builder.defineMacro("__HIPSTDPAR__"); - if (LangOpts.HIPStdParInterposeAlloc) + if (LangOpts.HIPStdParInterposeAlloc) { Builder.defineMacro("__HIPSTDPAR_INTERPOSE_ALLOC__"); + Builder.defineMacro("__HIPSTDPAR_INTERPOSE_ALLOC_V1__"); + } } if (LangOpts.CUDAIsDevice) { Builder.defineMacro("__HIP_DEVICE_COMPILE__"); diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 6d9e0c8e1fded..2e911aa41c7c5 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -333,6 +333,7 @@ set(cuda_wrapper_files ) 
set(cuda_wrapper_bits_files + cuda_wrappers/bits/c++config.h cuda_wrappers/bits/shared_ptr_base.h cuda_wrappers/bits/basic_string.h cuda_wrappers/bits/basic_string.tcc diff --git a/clang/lib/Headers/__clang_hip_libdevice_declares.h b/clang/lib/Headers/__clang_hip_libdevice_declares.h index f15198b3d9f93..fa8d918248dd0 100644 --- a/clang/lib/Headers/__clang_hip_libdevice_declares.h +++ b/clang/lib/Headers/__clang_hip_libdevice_declares.h @@ -14,6 +14,8 @@ #include "hip/hip_version.h" #endif // __has_include("hip/hip_version.h") +#define __PRIVATE_AS __attribute__((opencl_private)) + #ifdef __cplusplus extern "C" { #endif @@ -55,8 +57,7 @@ __device__ __attribute__((const)) float __ocml_fmax_f32(float, float); __device__ __attribute__((const)) float __ocml_fmin_f32(float, float); __device__ __attribute__((const)) __device__ float __ocml_fmod_f32(float, float); -__device__ float __ocml_frexp_f32(float, - __attribute__((address_space(5))) int *); +__device__ float __ocml_frexp_f32(float, __PRIVATE_AS int *); __device__ __attribute__((const)) float __ocml_hypot_f32(float, float); __device__ __attribute__((const)) int __ocml_ilogb_f32(float); __device__ __attribute__((const)) int __ocml_isfinite_f32(float); @@ -74,8 +75,7 @@ __device__ __attribute__((pure)) float __ocml_native_log2_f32(float); __device__ __attribute__((const)) float __ocml_logb_f32(float); __device__ __attribute__((pure)) float __ocml_log_f32(float); __device__ __attribute__((pure)) float __ocml_native_log_f32(float); -__device__ float __ocml_modf_f32(float, - __attribute__((address_space(5))) float *); +__device__ float __ocml_modf_f32(float, __PRIVATE_AS float *); __device__ __attribute__((const)) float __ocml_nearbyint_f32(float); __device__ __attribute__((const)) float __ocml_nextafter_f32(float, float); __device__ __attribute__((const)) float __ocml_len3_f32(float, float, float); @@ -87,8 +87,7 @@ __device__ __attribute__((pure)) float __ocml_pow_f32(float, float); __device__ __attribute__((pure)) 
float __ocml_pown_f32(float, int); __device__ __attribute__((pure)) float __ocml_rcbrt_f32(float); __device__ __attribute__((const)) float __ocml_remainder_f32(float, float); -__device__ float __ocml_remquo_f32(float, float, - __attribute__((address_space(5))) int *); +__device__ float __ocml_remquo_f32(float, float, __PRIVATE_AS int *); __device__ __attribute__((const)) float __ocml_rhypot_f32(float, float); __device__ __attribute__((const)) float __ocml_rint_f32(float); __device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float); @@ -99,10 +98,8 @@ __device__ __attribute__((pure)) float __ocml_rsqrt_f32(float); __device__ __attribute__((const)) float __ocml_scalb_f32(float, float); __device__ __attribute__((const)) float __ocml_scalbn_f32(float, int); __device__ __attribute__((const)) int __ocml_signbit_f32(float); -__device__ float __ocml_sincos_f32(float, - __attribute__((address_space(5))) float *); -__device__ float __ocml_sincospi_f32(float, - __attribute__((address_space(5))) float *); +__device__ float __ocml_sincos_f32(float, __PRIVATE_AS float *); +__device__ float __ocml_sincospi_f32(float, __PRIVATE_AS float *); __device__ float __ocml_sin_f32(float); __device__ float __ocml_native_sin_f32(float); __device__ __attribute__((pure)) float __ocml_sinh_f32(float); @@ -176,8 +173,7 @@ __device__ __attribute__((const)) double __ocml_fma_f64(double, double, double); __device__ __attribute__((const)) double __ocml_fmax_f64(double, double); __device__ __attribute__((const)) double __ocml_fmin_f64(double, double); __device__ __attribute__((const)) double __ocml_fmod_f64(double, double); -__device__ double __ocml_frexp_f64(double, - __attribute__((address_space(5))) int *); +__device__ double __ocml_frexp_f64(double, __PRIVATE_AS int *); __device__ __attribute__((const)) double __ocml_hypot_f64(double, double); __device__ __attribute__((const)) int __ocml_ilogb_f64(double); __device__ __attribute__((const)) int __ocml_isfinite_f64(double); @@ 
-192,8 +188,7 @@ __device__ __attribute__((pure)) double __ocml_log1p_f64(double); __device__ __attribute__((pure)) double __ocml_log2_f64(double); __device__ __attribute__((const)) double __ocml_logb_f64(double); __device__ __attribute__((pure)) double __ocml_log_f64(double); -__device__ double __ocml_modf_f64(double, - __attribute__((address_space(5))) double *); +__device__ double __ocml_modf_f64(double, __PRIVATE_AS double *); __device__ __attribute__((const)) double __ocml_nearbyint_f64(double); __device__ __attribute__((const)) double __ocml_nextafter_f64(double, double); __device__ __attribute__((const)) double __ocml_len3_f64(double, double, @@ -206,8 +201,7 @@ __device__ __attribute__((pure)) double __ocml_pow_f64(double, double); __device__ __attribute__((pure)) double __ocml_pown_f64(double, int); __device__ __attribute__((pure)) double __ocml_rcbrt_f64(double); __device__ __attribute__((const)) double __ocml_remainder_f64(double, double); -__device__ double __ocml_remquo_f64(double, double, - __attribute__((address_space(5))) int *); +__device__ double __ocml_remquo_f64(double, double, __PRIVATE_AS int *); __device__ __attribute__((const)) double __ocml_rhypot_f64(double, double); __device__ __attribute__((const)) double __ocml_rint_f64(double); __device__ __attribute__((const)) double __ocml_rlen3_f64(double, double, @@ -219,10 +213,8 @@ __device__ __attribute__((pure)) double __ocml_rsqrt_f64(double); __device__ __attribute__((const)) double __ocml_scalb_f64(double, double); __device__ __attribute__((const)) double __ocml_scalbn_f64(double, int); __device__ __attribute__((const)) int __ocml_signbit_f64(double); -__device__ double __ocml_sincos_f64(double, - __attribute__((address_space(5))) double *); -__device__ double -__ocml_sincospi_f64(double, __attribute__((address_space(5))) double *); +__device__ double __ocml_sincos_f64(double, __PRIVATE_AS double *); +__device__ double __ocml_sincospi_f64(double, __PRIVATE_AS double *); __device__ double 
__ocml_sin_f64(double); __device__ __attribute__((pure)) double __ocml_sinh_f64(double); __device__ double __ocml_sinpi_f64(double); diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h index 9e388558be3ee..431feed1e54c6 100644 --- a/clang/lib/Headers/__clang_hip_math.h +++ b/clang/lib/Headers/__clang_hip_math.h @@ -55,6 +55,9 @@ #define __DEVICE_NOCE__ __DEVICE__ #endif +#pragma push_macro("__PRIVATE_AS") + +#define __PRIVATE_AS __attribute__((opencl_private)) // Device library provides fast low precision and slow full-recision // implementations for some functions. Which one gets selected depends on // __CLANG_GPU_APPROX_TRANSCENDENTALS__ which gets defined by clang if @@ -539,8 +542,7 @@ float modff(float __x, float *__iptr) { #ifdef __OPENMP_AMDGCN__ #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) #endif - float __r = - __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); + float __r = __ocml_modf_f32(__x, (__PRIVATE_AS float *)&__tmp); *__iptr = __tmp; return __r; } @@ -625,8 +627,7 @@ float remquof(float __x, float __y, int *__quo) { #ifdef __OPENMP_AMDGCN__ #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) #endif - float __r = __ocml_remquo_f32( - __x, __y, (__attribute__((address_space(5))) int *)&__tmp); + float __r = __ocml_remquo_f32(__x, __y, (__PRIVATE_AS int *)&__tmp); *__quo = __tmp; return __r; @@ -687,8 +688,7 @@ void sincosf(float __x, float *__sinptr, float *__cosptr) { #ifdef __CLANG_CUDA_APPROX_TRANSCENDENTALS__ __sincosf(__x, __sinptr, __cosptr); #else - *__sinptr = - __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); + *__sinptr = __ocml_sincos_f32(__x, (__PRIVATE_AS float *)&__tmp); *__cosptr = __tmp; #endif } @@ -699,8 +699,7 @@ void sincospif(float __x, float *__sinptr, float *__cosptr) { #ifdef __OPENMP_AMDGCN__ #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) #endif - *__sinptr = __ocml_sincospi_f32( - __x, 
(__attribute__((address_space(5))) float *)&__tmp); + *__sinptr = __ocml_sincospi_f32(__x, (__PRIVATE_AS float *)&__tmp); *__cosptr = __tmp; } @@ -943,8 +942,7 @@ double modf(double __x, double *__iptr) { #ifdef __OPENMP_AMDGCN__ #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) #endif - double __r = - __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp); + double __r = __ocml_modf_f64(__x, (__PRIVATE_AS double *)&__tmp); *__iptr = __tmp; return __r; @@ -1037,8 +1035,7 @@ double remquo(double __x, double __y, int *__quo) { #ifdef __OPENMP_AMDGCN__ #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) #endif - double __r = __ocml_remquo_f64( - __x, __y, (__attribute__((address_space(5))) int *)&__tmp); + double __r = __ocml_remquo_f64(__x, __y, (__PRIVATE_AS int *)&__tmp); *__quo = __tmp; return __r; @@ -1098,8 +1095,7 @@ void sincos(double __x, double *__sinptr, double *__cosptr) { #ifdef __OPENMP_AMDGCN__ #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) #endif - *__sinptr = __ocml_sincos_f64( - __x, (__attribute__((address_space(5))) double *)&__tmp); + *__sinptr = __ocml_sincos_f64(__x, (__PRIVATE_AS double *)&__tmp); *__cosptr = __tmp; } @@ -1109,8 +1105,7 @@ void sincospi(double __x, double *__sinptr, double *__cosptr) { #ifdef __OPENMP_AMDGCN__ #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) #endif - *__sinptr = __ocml_sincospi_f64( - __x, (__attribute__((address_space(5))) double *)&__tmp); + *__sinptr = __ocml_sincospi_f64(__x, (__PRIVATE_AS double *)&__tmp); *__cosptr = __tmp; } @@ -1358,6 +1353,7 @@ __host__ inline static int max(int __arg1, int __arg2) { #pragma pop_macro("__DEVICE_NOCE__") #pragma pop_macro("__DEVICE__") +#pragma pop_macro("__PRIVATE_AS") #pragma pop_macro("__RETURN_TYPE") #pragma pop_macro("__FAST_OR_SLOW") diff --git a/clang/lib/Headers/cuda_wrappers/bits/c++config.h b/clang/lib/Headers/cuda_wrappers/bits/c++config.h new file mode 100644 index 0000000000000..27083253181d2 --- 
/dev/null +++ b/clang/lib/Headers/cuda_wrappers/bits/c++config.h @@ -0,0 +1,61 @@ +// libstdc++ uses the non-constexpr function std::__glibcxx_assert_fail() +// to trigger compilation errors when the __glibcxx_assert(cond) macro +// is used in a constexpr context. +// Compilation fails when using code from the libstdc++ (such as std::array) on +// device code, since these assertions invoke a non-constexpr host function from +// device code. +// +// To work around this issue, we declare our own device version of the function + +#ifndef __CLANG_CUDA_WRAPPERS_BITS_CPP_CONFIG +#define __CLANG_CUDA_WRAPPERS_BITS_CPP_CONFIG + +#include_next + +#ifdef _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_NAMESPACE_STD +#else +namespace std { +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_BEGIN_NAMESPACE_VERSION +#endif + +#pragma push_macro("CUDA_NOEXCEPT") +#if __cplusplus >= 201103L +#define CUDA_NOEXCEPT noexcept +#else +#define CUDA_NOEXCEPT +#endif + +__attribute__((device, noreturn)) inline void +__glibcxx_assert_fail(const char *file, int line, const char *function, + const char *condition) CUDA_NOEXCEPT { +#ifdef _GLIBCXX_VERBOSE_ASSERT + if (file && function && condition) + __builtin_printf("%s:%d: %s: Assertion '%s' failed.\n", file, line, + function, condition); + else if (function) + __builtin_printf("%s: Undefined behavior detected.\n", function); +#endif + __builtin_abort(); +} + +#endif +__attribute__((device, noreturn, __always_inline__, + __visibility__("default"))) inline void +__glibcxx_assert_fail() CUDA_NOEXCEPT { + __builtin_abort(); +} + +#pragma pop_macro("CUDA_NOEXCEPT") + +#ifdef _LIBCPP_END_NAMESPACE_STD +_LIBCPP_END_NAMESPACE_STD +#else +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_END_NAMESPACE_VERSION +#endif +} // namespace std +#endif + +#endif diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp index 4b92d67e49d7d..7c0c3ca3125a8 100644 --- a/clang/lib/Sema/JumpDiagnostics.cpp +++ 
b/clang/lib/Sema/JumpDiagnostics.cpp @@ -19,6 +19,7 @@ #include "clang/AST/StmtOpenACC.h" #include "clang/AST/StmtOpenMP.h" #include "clang/Basic/SourceLocation.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaInternal.h" #include "llvm/ADT/BitVector.h" using namespace clang; @@ -368,8 +369,10 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S, case Stmt::IfStmtClass: { IfStmt *IS = cast(S); + bool AMDGPUPredicate = false; if (!(IS->isConstexpr() || IS->isConsteval() || - IS->isObjCAvailabilityCheck())) + IS->isObjCAvailabilityCheck() || + (AMDGPUPredicate = this->S.AMDGPU().IsPredicate(IS->getCond())))) break; unsigned Diag = diag::note_protected_by_if_available; @@ -377,6 +380,8 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S, Diag = diag::note_protected_by_constexpr_if; else if (IS->isConsteval()) Diag = diag::note_protected_by_consteval_if; + else if (AMDGPUPredicate) + Diag = diag::note_amdgcn_protected_by_predicate; if (VarDecl *Var = IS->getConditionVariable()) BuildScopeInformation(Var, ParentScope); diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 9507d7602aa40..7c88c10b83580 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -506,8 +506,13 @@ void Sema::Initialize() { } if (Context.getTargetInfo().getTriple().isAMDGPU() || + (Context.getTargetInfo().getTriple().isSPIRV() && + Context.getTargetInfo().getTriple().getVendor() == llvm::Triple::AMD) || (Context.getAuxTargetInfo() && - Context.getAuxTargetInfo()->getTriple().isAMDGPU())) { + (Context.getAuxTargetInfo()->getTriple().isAMDGPU() || + (Context.getAuxTargetInfo()->getTriple().isSPIRV() && + Context.getAuxTargetInfo()->getTriple().getVendor() == + llvm::Triple::AMD)))) { #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ addImplicitTypedef(Name, Context.SingletonId); #include "clang/Basic/AMDGPUTypes.def" diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index a4d075dfd0768..85771e79e8e16 100644 --- 
a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -13,6 +13,7 @@ #include "clang/Sema/SemaAMDGPU.h" #include "clang/Basic/DiagnosticSema.h" #include "clang/Basic/TargetBuiltins.h" +#include "clang/Basic/TargetInfo.h" #include "clang/Sema/Ownership.h" #include "clang/Sema/Sema.h" #include "llvm/Support/AtomicOrdering.h" @@ -35,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, Builtin::evaluateRequiredTargetFeatures("gfx950-insts", CallerFeatureMap); switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds: case AMDGPU::BI__builtin_amdgcn_global_load_lds: { constexpr const int SizeIdx = 2; llvm::APSInt Size; @@ -54,11 +56,9 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, [[fallthrough]]; } default: - Diag(ArgExpr->getExprLoc(), - diag::err_amdgcn_global_load_lds_size_invalid_value) + Diag(ArgExpr->getExprLoc(), diag::err_amdgcn_load_lds_size_invalid_value) << ArgExpr->getSourceRange(); - Diag(ArgExpr->getExprLoc(), - diag::note_amdgcn_global_load_lds_size_valid_value) + Diag(ArgExpr->getExprLoc(), diag::note_amdgcn_load_lds_size_valid_value) << HasGFX950Insts << ArgExpr->getSourceRange(); return true; } @@ -367,4 +367,80 @@ void SemaAMDGPU::handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, addAMDGPUMaxNumWorkGroupsAttr(D, AL, AL.getArgAsExpr(0), YExpr, ZExpr); } +Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { + ASTContext &Ctx = getASTContext(); + QualType BoolTy = Ctx.getLogicalOperationType(); + llvm::APInt False = llvm::APInt::getZero(Ctx.getIntWidth(BoolTy)); + llvm::APInt True = llvm::APInt::getAllOnes(Ctx.getIntWidth(BoolTy)); + SourceLocation Loc = CE->getExprLoc(); + + if (!CE->getBuiltinCallee()) + return *ExpandedPredicates + .insert(IntegerLiteral::Create(Ctx, False, BoolTy, Loc)) + .first; + + bool P = false; + unsigned BI = CE->getBuiltinCallee(); + if (Ctx.BuiltinInfo.isAuxBuiltinID(BI)) + BI = Ctx.BuiltinInfo.getAuxBuiltinID(BI); + + if (BI == 
AMDGPU::BI__builtin_amdgcn_processor_is) { + auto *GFX = dyn_cast(CE->getArg(0)->IgnoreParenCasts()); + if (!GFX) { + Diag(Loc, diag::err_amdgcn_processor_is_arg_not_literal); + return nullptr; + } + + StringRef N = GFX->getString(); + const TargetInfo &TI = Ctx.getTargetInfo(); + const TargetInfo *AuxTI = Ctx.getAuxTargetInfo(); + if (!TI.isValidCPUName(N) && (!AuxTI || !AuxTI->isValidCPUName(N))) { + Diag(Loc, diag::err_amdgcn_processor_is_arg_invalid_value) << N; + SmallVector ValidList; + if (TI.getTriple().getVendor() == llvm::Triple::VendorType::AMD) + TI.fillValidCPUList(ValidList); + else if (AuxTI) // Since the BI is present it must be and AMDGPU triple. + AuxTI->fillValidCPUList(ValidList); + if (!ValidList.empty()) + Diag(Loc, diag::note_amdgcn_processor_is_valid_options) + << llvm::join(ValidList, ", "); + return nullptr; + } + if (Ctx.getTargetInfo().getTriple().isSPIRV()) { + CE->setType(BoolTy); + return *ExpandedPredicates.insert(CE).first; + } + + if (auto TID = Ctx.getTargetInfo().getTargetID()) + P = TID->find(N) == 0; + } else { + Expr *Arg = CE->getArg(0); + if (!Arg || Arg->getType() != Ctx.BuiltinFnTy) { + Diag(Loc, diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg; + return nullptr; + } + + if (Ctx.getTargetInfo().getTriple().isSPIRV()) { + CE->setType(BoolTy); + return *ExpandedPredicates.insert(CE).first; + } + + auto *FD = cast(Arg->getReferencedDeclOfCallee()); + + StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); + llvm::StringMap CF; + Ctx.getFunctionFeatureMap(CF, FD); + + P = Builtin::evaluateRequiredTargetFeatures(RF, CF); + } + + return *ExpandedPredicates + .insert( + IntegerLiteral::Create(Ctx, P ? 
True : False, BoolTy, Loc)) + .first; +} + +bool SemaAMDGPU::IsPredicate(Expr *E) const { + return ExpandedPredicates.contains(E); +} } // namespace clang diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp index 0e1bf727d72d2..1e4825312f738 100644 --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -18,6 +18,7 @@ #include "clang/Basic/TargetInfo.h" #include "clang/Lex/Preprocessor.h" #include "clang/Sema/Lookup.h" +#include "clang/Sema/Overload.h" #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/Sema.h" #include "clang/Sema/Template.h" @@ -1081,3 +1082,49 @@ std::string SemaCUDA::getConfigureFuncName() const { // Legacy CUDA kernel configuration call return "cudaConfigureCall"; } + +// Record any local constexpr variables that are passed one way on the host +// and another on the device. +void SemaCUDA::recordPotentialODRUsedVariable( + MultiExprArg Arguments, OverloadCandidateSet &Candidates) { + sema::LambdaScopeInfo *LambdaInfo = SemaRef.getCurLambda(); + if (!LambdaInfo) + return; + + for (unsigned I = 0; I < Arguments.size(); ++I) { + auto *DeclRef = dyn_cast(Arguments[I]); + if (!DeclRef) + continue; + auto *Variable = dyn_cast(DeclRef->getDecl()); + if (!Variable || !Variable->isLocalVarDecl() || !Variable->isConstexpr()) + continue; + + bool HostByValue = false, HostByRef = false; + bool DeviceByValue = false, DeviceByRef = false; + + for (OverloadCandidate &Candidate : Candidates) { + FunctionDecl *Callee = Candidate.Function; + if (!Callee || I >= Callee->getNumParams()) + continue; + + CUDAFunctionTarget Target = IdentifyTarget(Callee); + if (Target == CUDAFunctionTarget::InvalidTarget || + Target == CUDAFunctionTarget::Global) + continue; + + bool CoversHost = (Target == CUDAFunctionTarget::Host || + Target == CUDAFunctionTarget::HostDevice); + bool CoversDevice = (Target == CUDAFunctionTarget::Device || + Target == CUDAFunctionTarget::HostDevice); + + bool IsRef = 
Callee->getParamDecl(I)->getType()->isReferenceType(); + HostByValue |= CoversHost && !IsRef; + HostByRef |= CoversHost && IsRef; + DeviceByValue |= CoversDevice && !IsRef; + DeviceByRef |= CoversDevice && IsRef; + } + + if ((HostByValue && DeviceByRef) || (HostByRef && DeviceByValue)) + LambdaInfo->CUDAPotentialODRUsedVars.insert(Variable); + } +} diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index 54bc52fa2ac40..e260c8bceb8f6 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -22,6 +22,7 @@ #include "clang/Basic/PartialDiagnostic.h" #include "clang/Basic/TargetInfo.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/Initialization.h" #include "clang/Sema/SemaObjC.h" #include "clang/Sema/SemaRISCV.h" @@ -1525,6 +1526,22 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr, return TC_Success; } + if (SrcType == Self.Context.AMDGPUFeaturePredicateTy && + DestType == Self.Context.getLogicalOperationType()) { + SrcExpr = Self.AMDGPU().ExpandAMDGPUPredicateBI( + dyn_cast(SrcExpr.get())); + Kind = CK_NoOp; + return TC_Success; + } + + if (SrcType == Self.Context.AMDGPUFeaturePredicateTy && + DestType == Self.Context.getLogicalOperationType()) { + SrcExpr = Self.AMDGPU().ExpandAMDGPUPredicateBI( + dyn_cast(SrcExpr.get())); + Kind = CK_NoOp; + return TC_Success; + } + // We tried everything. Everything! Nothing works! :-( return TC_NotApplicable; } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index a324dca3e59d9..55691589a6401 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -8141,7 +8141,7 @@ NamedDecl *Sema::ActOnVariableDeclarator( (D.getCXXScopeSpec().isSet() && DC && DC->isRecord() && DC->isDependentContext()) ? 
TPC_ClassTemplateMember - : TPC_VarTemplate)) + : TPC_Other)) NewVD->setInvalidDecl(); // If we are providing an explicit specialization of a static variable @@ -13402,6 +13402,26 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { return; } + // __amdgpu_feature_predicate_t cannot be initialised + if (VDecl->getType().getDesugaredType(Context) == + Context.AMDGPUFeaturePredicateTy) { + Diag(VDecl->getLocation(), + diag::err_amdgcn_predicate_type_is_not_constructible) + << VDecl; + VDecl->setInvalidDecl(); + return; + } + + // __amdgpu_feature_predicate_t cannot be initialised + if (VDecl->getType().getDesugaredType(Context) == + Context.AMDGPUFeaturePredicateTy) { + Diag(VDecl->getLocation(), + diag::err_amdgcn_predicate_type_is_not_constructible) + << VDecl; + VDecl->setInvalidDecl(); + return; + } + // WebAssembly tables can't be used to initialise a variable. if (!Init->getType().isNull() && Init->getType()->isWebAssemblyTableType()) { Diag(Init->getExprLoc(), diag::err_wasm_table_art) << 0; @@ -13931,6 +13951,13 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) { if (VarDecl *Var = dyn_cast(RealDecl)) { QualType Type = Var->getType(); + if (Type.getDesugaredType(Context) == Context.AMDGPUFeaturePredicateTy) { + Diag(Var->getLocation(), + diag::err_amdgcn_predicate_type_is_not_constructible) + << Var; + Var->setInvalidDecl(); + return; + } // C++1z [dcl.dcl]p1 grammar implies that an initializer is mandatory. if (isa(RealDecl)) { Diag(Var->getLocation(), diag::err_decomp_decl_requires_init) << Var; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index e4e3bbad1f520..85de46c9adab4 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -13533,7 +13533,7 @@ Decl *Sema::ActOnAliasDeclaration(Scope *S, AccessSpecifier AS, // Merge any previous default template arguments into our parameters, // and check the parameter list. 
if (CheckTemplateParameterList(TemplateParams, OldTemplateParams, - TPC_TypeAliasTemplate)) + TPC_Other)) return nullptr; TypeAliasTemplateDecl *NewDecl = diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index e253e3a17328f..54f4c541c7fa9 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -50,6 +50,7 @@ #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaFixItUtils.h" #include "clang/Sema/SemaHLSL.h" @@ -6482,6 +6483,22 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, if (Result.isInvalid()) return ExprError(); Fn = Result.get(); + // The __builtin_amdgcn_is_invocable builtin is special, and will be resolved + // later, when we check boolean conditions, for now we merely forward it + // without any additional checking. + if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 && + ArgExprs[0]->getType() == Context.BuiltinFnTy) { + const auto *FD = cast(Fn->getReferencedDeclOfCallee()); + + if (FD->getName() == "__builtin_amdgcn_is_invocable") { + QualType FnPtrTy = Context.getPointerType(FD->getType()); + Expr *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get(); + return CallExpr::Create( + Context, R, ArgExprs, Context.AMDGPUFeaturePredicateTy, + ExprValueKind::VK_PRValue, RParenLoc, FPOptionsOverride()); + } + } + if (CheckArgsForPlaceholders(ArgExprs)) return ExprError(); @@ -13165,6 +13182,20 @@ inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS, return InvalidOperands(Loc, LHS, RHS); } +static inline bool IsAMDGPUPredicateBI(Expr *E) { + if (!E->getType()->isVoidType()) + return false; + + if (auto *CE = dyn_cast(E)) { + if (auto *BI = CE->getDirectCallee()) + if (BI->getName() == "__builtin_amdgcn_processor_is" || + BI->getName() == "__builtin_amdgcn_is_invocable") + return true; + } + + 
return false; +} + // C99 6.5.[13,14] inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, SourceLocation Loc, @@ -13260,6 +13291,9 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, // The following is safe because we only use this method for // non-overloadable operands. + if (IsAMDGPUPredicateBI(LHS.get()) && IsAMDGPUPredicateBI(RHS.get())) + return Context.VoidTy; + // C++ [expr.log.and]p1 // C++ [expr.log.or]p1 // The operands are both contextually converted to type bool. @@ -15683,6 +15717,10 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc, // Vector logical not returns the signed variant of the operand type. resultType = GetSignedVectorType(resultType); break; + } else if (resultType == Context.AMDGPUFeaturePredicateTy) { + resultType = Context.getLogicalOperationType(); + Input = AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast(InputExpr)); + break; } else { return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr) << resultType << Input.get()->getSourceRange()); @@ -19385,11 +19423,29 @@ static ExprResult rebuildPotentialResultsAsNonOdrUsed(Sema &S, Expr *E, return false; }; + // Check whether this expression may be odr-used in CUDA/HIP. + auto MaybeCUDAODRUsed = [&]() -> bool { + if (!S.LangOpts.CUDA) + return false; + LambdaScopeInfo *LSI = S.getCurLambda(); + if (!LSI) + return false; + auto *DRE = dyn_cast(E); + if (!DRE) + return false; + auto *VD = dyn_cast(DRE->getDecl()); + if (!VD) + return false; + return LSI->CUDAPotentialODRUsedVars.count(VD); + }; + // Mark that this expression does not constitute an odr-use. 
auto MarkNotOdrUsed = [&] { - S.MaybeODRUseExprs.remove(E); - if (LambdaScopeInfo *LSI = S.getCurLambda()) - LSI->markVariableExprAsNonODRUsed(E); + if (!MaybeCUDAODRUsed()) { + S.MaybeODRUseExprs.remove(E); + if (LambdaScopeInfo *LSI = S.getCurLambda()) + LSI->markVariableExprAsNonODRUsed(E); + } }; // C++2a [basic.def.odr]p2: @@ -20406,6 +20462,9 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E, E = result.get(); if (!E->isTypeDependent()) { + if (E->getType() == Context.AMDGPUFeaturePredicateTy) + return AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast_or_null(E)); + if (getLangOpts().CPlusPlus) return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4 diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp index d130e8b86bc56..b8a14a6d60ca3 100644 --- a/clang/lib/Sema/SemaExprMember.cpp +++ b/clang/lib/Sema/SemaExprMember.cpp @@ -987,6 +987,8 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType, // arrow operator was used with a dependent non-pointer object expression, // build a CXXDependentScopeMemberExpr. if (R.wasNotFoundInCurrentInstantiation() || + (IsArrow && !BaseExprType->isPointerType() && + BaseExprType->isDependentType()) || (R.getLookupName().getCXXOverloadedOperator() == OO_Equal && (SS.isSet() ? SS.getScopeRep()->isDependent() : BaseExprType->isDependentType()))) diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index b95cbbf422205..aa25da49e965f 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -8988,6 +8988,15 @@ bool InitializationSequence::Diagnose(Sema &S, case FK_ConversionFailed: { QualType FromType = OnlyArg->getType(); + // __amdgpu_feature_predicate_t can be explicitly cast to the logical op + // type, although this is almost always an error and we advise against it. 
+ if (FromType == S.Context.AMDGPUFeaturePredicateTy && + DestType == S.Context.getLogicalOperationType()) { + S.Diag(OnlyArg->getExprLoc(), + diag::err_amdgcn_predicate_type_needs_explicit_bool_cast) + << OnlyArg << DestType; + break; + } PartialDiagnostic PDiag = S.PDiag(diag::err_init_conversion_failed) << (int)Entity.getKind() << DestType @@ -9782,6 +9791,14 @@ Sema::PerformCopyInitialization(const InitializedEntity &Entity, if (EqualLoc.isInvalid()) EqualLoc = InitE->getBeginLoc(); + if (Entity.getType().getDesugaredType(Context) == + Context.AMDGPUFeaturePredicateTy && + Entity.getDecl()) { + Diag(EqualLoc, diag::err_amdgcn_predicate_type_is_not_constructible) + << Entity.getDecl(); + return ExprError(); + } + InitializationKind Kind = InitializationKind::CreateCopy( InitE->getBeginLoc(), EqualLoc, AllowExplicit); InitializationSequence Seq(*this, Entity, Kind, InitE, TopLevelOfInitList); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 1af2924166774..7cecd74dec302 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -12159,51 +12159,61 @@ bool OpenMPAtomicCompareChecker::checkCondUpdateStmt(IfStmt *S, X = BO->getLHS(); auto *Cond = dyn_cast(S->getCond()); - if (!Cond) { + auto *Call = dyn_cast(S->getCond()); + Expr *LHS = nullptr; + Expr *RHS = nullptr; + if (Cond) { + LHS = Cond->getLHS(); + RHS = Cond->getRHS(); + } else if (Call) { + LHS = Call->getArg(0); + RHS = Call->getArg(1); + } else { ErrorInfo.Error = ErrorTy::NotABinaryOp; ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = S->getCond()->getExprLoc(); ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S->getCond()->getSourceRange(); return false; } - switch (Cond->getOpcode()) { - case BO_EQ: { - C = Cond; + if ((Cond && Cond->getOpcode() == BO_EQ) || + (Call && Call->getOperator() == OverloadedOperatorKind::OO_EqualEqual)) { + C = S->getCond(); D = BO->getRHS(); - if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS())) { - E = Cond->getRHS(); - } 
else if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getRHS())) { - E = Cond->getLHS(); + if (checkIfTwoExprsAreSame(ContextRef, X, LHS)) { + E = RHS; + } else if (checkIfTwoExprsAreSame(ContextRef, X, RHS)) { + E = LHS; } else { ErrorInfo.Error = ErrorTy::InvalidComparison; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = S->getCond()->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = + S->getCond()->getSourceRange(); return false; } - break; - } - case BO_LT: - case BO_GT: { + } else if ((Cond && + (Cond->getOpcode() == BO_LT || Cond->getOpcode() == BO_GT)) || + (Call && + (Call->getOperator() == OverloadedOperatorKind::OO_Less || + Call->getOperator() == OverloadedOperatorKind::OO_Greater))) { E = BO->getRHS(); - if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS()) && - checkIfTwoExprsAreSame(ContextRef, E, Cond->getRHS())) { - C = Cond; - } else if (checkIfTwoExprsAreSame(ContextRef, E, Cond->getLHS()) && - checkIfTwoExprsAreSame(ContextRef, X, Cond->getRHS())) { - C = Cond; + if (checkIfTwoExprsAreSame(ContextRef, X, LHS) && + checkIfTwoExprsAreSame(ContextRef, E, RHS)) { + C = S->getCond(); + } else if (checkIfTwoExprsAreSame(ContextRef, E, LHS) && + checkIfTwoExprsAreSame(ContextRef, X, RHS)) { + C = S->getCond(); IsXBinopExpr = false; } else { ErrorInfo.Error = ErrorTy::InvalidComparison; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = S->getCond()->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = + S->getCond()->getSourceRange(); return false; } - break; - } - default: + } else { ErrorInfo.Error = ErrorTy::InvalidBinaryOp; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); + ErrorInfo.ErrorLoc = 
ErrorInfo.NoteLoc = S->getCond()->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S->getCond()->getSourceRange(); return false; } @@ -12253,7 +12263,16 @@ bool OpenMPAtomicCompareChecker::checkCondExprStmt(Stmt *S, } auto *Cond = dyn_cast(CO->getCond()); - if (!Cond) { + auto *Call = dyn_cast(CO->getCond()); + Expr *LHS = nullptr; + Expr *RHS = nullptr; + if (Cond) { + LHS = Cond->getLHS(); + RHS = Cond->getRHS(); + } else if (Call) { + LHS = Call->getArg(0); + RHS = Call->getArg(1); + } else { ErrorInfo.Error = ErrorTy::NotABinaryOp; ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = CO->getCond()->getExprLoc(); ErrorInfo.ErrorRange = ErrorInfo.NoteRange = @@ -12261,44 +12280,47 @@ bool OpenMPAtomicCompareChecker::checkCondExprStmt(Stmt *S, return false; } - switch (Cond->getOpcode()) { - case BO_EQ: { - C = Cond; + if ((Cond && Cond->getOpcode() == BO_EQ) || + (Call && Call->getOperator() == OverloadedOperatorKind::OO_EqualEqual)) { + C = CO->getCond(); D = CO->getTrueExpr(); - if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS())) { - E = Cond->getRHS(); - } else if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getRHS())) { - E = Cond->getLHS(); + if (checkIfTwoExprsAreSame(ContextRef, X, LHS)) { + E = RHS; + } else if (checkIfTwoExprsAreSame(ContextRef, X, RHS)) { + E = LHS; } else { ErrorInfo.Error = ErrorTy::InvalidComparison; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = CO->getCond()->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = + CO->getCond()->getSourceRange(); return false; } - break; - } - case BO_LT: - case BO_GT: { + } else if ((Cond && + (Cond->getOpcode() == BO_LT || Cond->getOpcode() == BO_GT)) || + (Call && + (Call->getOperator() == OverloadedOperatorKind::OO_Less || + Call->getOperator() == OverloadedOperatorKind::OO_Greater))) { + E = CO->getTrueExpr(); - if (checkIfTwoExprsAreSame(ContextRef, 
X, Cond->getLHS()) && - checkIfTwoExprsAreSame(ContextRef, E, Cond->getRHS())) { - C = Cond; - } else if (checkIfTwoExprsAreSame(ContextRef, E, Cond->getLHS()) && - checkIfTwoExprsAreSame(ContextRef, X, Cond->getRHS())) { - C = Cond; + if (checkIfTwoExprsAreSame(ContextRef, X, LHS) && + checkIfTwoExprsAreSame(ContextRef, E, RHS)) { + C = CO->getCond(); + } else if (checkIfTwoExprsAreSame(ContextRef, E, LHS) && + checkIfTwoExprsAreSame(ContextRef, X, RHS)) { + C = CO->getCond(); IsXBinopExpr = false; } else { ErrorInfo.Error = ErrorTy::InvalidComparison; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = CO->getCond()->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = + CO->getCond()->getSourceRange(); return false; } - break; - } - default: + } else { ErrorInfo.Error = ErrorTy::InvalidBinaryOp; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = CO->getCond()->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = + CO->getCond()->getSourceRange(); return false; } @@ -12459,31 +12481,41 @@ bool OpenMPAtomicCompareCaptureChecker::checkForm3(IfStmt *S, D = BO->getRHS(); auto *Cond = dyn_cast(S->getCond()); - if (!Cond) { + auto *Call = dyn_cast(S->getCond()); + Expr *LHS = nullptr; + Expr *RHS = nullptr; + if (Cond) { + LHS = Cond->getLHS(); + RHS = Cond->getRHS(); + } else if (Call) { + LHS = Call->getArg(0); + RHS = Call->getArg(1); + } else { ErrorInfo.Error = ErrorTy::NotABinaryOp; ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = S->getCond()->getExprLoc(); ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S->getCond()->getSourceRange(); return false; } - if (Cond->getOpcode() != BO_EQ) { + if ((Cond && Cond->getOpcode() != BO_EQ) || + (Call && Call->getOperator() != OverloadedOperatorKind::OO_EqualEqual)) { 
ErrorInfo.Error = ErrorTy::NotEQ; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = S->getCond()->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S->getCond()->getSourceRange(); return false; } - if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS())) { - E = Cond->getRHS(); - } else if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getRHS())) { - E = Cond->getLHS(); + if (checkIfTwoExprsAreSame(ContextRef, X, LHS)) { + E = RHS; + } else if (checkIfTwoExprsAreSame(ContextRef, X, RHS)) { + E = LHS; } else { ErrorInfo.Error = ErrorTy::InvalidComparison; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = S->getCond()->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S->getCond()->getSourceRange(); return false; } - C = Cond; + C = S->getCond(); if (!S->getElse()) { ErrorInfo.Error = ErrorTy::NoElse; diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 16ecea67aea97..6268ac2f241d0 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -30,6 +30,7 @@ #include "clang/Sema/Initialization.h" #include "clang/Sema/Lookup.h" #include "clang/Sema/Overload.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaObjC.h" #include "clang/Sema/Template.h" @@ -6059,12 +6060,13 @@ TryContextuallyConvertToBool(Sema &S, Expr *From) { ExprResult Sema::PerformContextuallyConvertToBool(Expr *From) { if (checkPlaceholderForOverload(*this, From)) return ExprError(); + if (From->getType() == Context.AMDGPUFeaturePredicateTy) + return AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast(From)); ImplicitConversionSequence ICS = TryContextuallyConvertToBool(*this, From); if (!ICS.isBad()) return PerformImplicitConversion(From, 
Context.BoolTy, ICS, AssignmentAction::Converting); - if (!DiagnoseMultipleUserDefinedConversion(From, Context.BoolTy)) return Diag(From->getBeginLoc(), diag::err_typecheck_bool_condition) << From->getType() << From->getSourceRange(); @@ -11521,6 +11523,16 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand, if (TakingCandidateAddress && !checkAddressOfCandidateIsAvailable(S, Fn)) return; + // __amdgpu_feature_predicate_t can be explicitly cast to the logical op type, + // although this is almost always an error and we advise against it. + if (FromTy == S.Context.AMDGPUFeaturePredicateTy && + ToTy == S.Context.getLogicalOperationType()) { + S.Diag(Conv.Bad.FromExpr->getExprLoc(), + diag::err_amdgcn_predicate_type_needs_explicit_bool_cast) + << Conv.Bad.FromExpr << ToTy; + return; + } + // Emit the generic diagnostic and, optionally, add the hints to it. PartialDiagnostic FDiag = S.PDiag(diag::note_ovl_candidate_bad_conv); FDiag << (unsigned)FnKindPair.first << (unsigned)FnKindPair.second << FnDesc @@ -14258,6 +14270,8 @@ ExprResult Sema::BuildOverloadedCallExpr(Scope *S, Expr *Fn, // the UnresolvedLookupExpr was type-dependent. if (OverloadResult == OR_Success) { const FunctionDecl *FDecl = Best->Function; + if (LangOpts.CUDA) + CUDA().recordPotentialODRUsedVariable(Args, CandidateSet); if (FDecl && FDecl->isTemplateInstantiation() && FDecl->getReturnType()->isUndeducedType()) { if (const auto *TP = diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 3944c4f67bab9..5870e0ed8e67e 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1591,8 +1591,16 @@ NamedDecl *Sema::ActOnTemplateTemplateParameter( assert(S->isTemplateParamScope() && "Template template parameter not in template parameter scope!"); - // Construct the parameter object. 
bool IsParameterPack = EllipsisLoc.isValid(); + + bool Invalid = false; + if (CheckTemplateParameterList( + Params, + /*OldParams=*/nullptr, + IsParameterPack ? TPC_TemplateTemplateParameterPack : TPC_Other)) + Invalid = true; + + // Construct the parameter object. TemplateTemplateParmDecl *Param = TemplateTemplateParmDecl::Create( Context, Context.getTranslationUnitDecl(), NameLoc.isInvalid() ? TmpLoc : NameLoc, Depth, Position, IsParameterPack, @@ -1615,9 +1623,12 @@ NamedDecl *Sema::ActOnTemplateTemplateParameter( if (Params->size() == 0) { Diag(Param->getLocation(), diag::err_template_template_parm_no_parms) << SourceRange(Params->getLAngleLoc(), Params->getRAngleLoc()); - Param->setInvalidDecl(); + Invalid = true; } + if (Invalid) + Param->setInvalidDecl(); + // C++0x [temp.param]p9: // A default template-argument may be specified for any kind of // template-parameter that is not a template parameter pack. @@ -2066,7 +2077,7 @@ DeclResult Sema::CheckClassTemplate( SemanticContext->isDependentContext()) ? TPC_ClassTemplateMember : TUK == TagUseKind::Friend ? TPC_FriendClassTemplate - : TPC_ClassTemplate, + : TPC_Other, SkipBody)) Invalid = true; @@ -2208,9 +2219,8 @@ static bool DiagnoseDefaultTemplateArgument(Sema &S, SourceLocation ParamLoc, SourceRange DefArgRange) { switch (TPC) { - case Sema::TPC_ClassTemplate: - case Sema::TPC_VarTemplate: - case Sema::TPC_TypeAliasTemplate: + case Sema::TPC_Other: + case Sema::TPC_TemplateTemplateParameterPack: return false; case Sema::TPC_FunctionTemplate: @@ -2383,8 +2393,11 @@ bool Sema::CheckTemplateParameterList(TemplateParameterList *NewParams, MissingDefaultArg = true; } else if (NonTypeTemplateParmDecl *NewNonTypeParm = dyn_cast(*NewParam)) { - // Check for unexpanded parameter packs. - if (!NewNonTypeParm->isParameterPack() && + // Check for unexpanded parameter packs, except in a template template + // parameter pack, as in those any unexpanded packs should be expanded + // along with the parameter itself. 
+ if (TPC != TPC_TemplateTemplateParameterPack && + !NewNonTypeParm->isParameterPack() && DiagnoseUnexpandedParameterPack(NewNonTypeParm->getLocation(), NewNonTypeParm->getTypeSourceInfo(), UPPC_NonTypeTemplateParameterType)) { @@ -2492,8 +2505,7 @@ bool Sema::CheckTemplateParameterList(TemplateParameterList *NewParams, // If a template parameter of a primary class template or alias template // is a template parameter pack, it shall be the last template parameter. if (SawParameterPack && (NewParam + 1) != NewParamEnd && - (TPC == TPC_ClassTemplate || TPC == TPC_VarTemplate || - TPC == TPC_TypeAliasTemplate)) { + (TPC == TPC_Other || TPC == TPC_TemplateTemplateParameterPack)) { Diag((*NewParam)->getLocation(), diag::err_template_param_pack_must_be_last_template_parameter); Invalid = true; @@ -2526,8 +2538,8 @@ bool Sema::CheckTemplateParameterList(TemplateParameterList *NewParams, << PrevModuleName; Invalid = true; } else if (MissingDefaultArg && - (TPC == TPC_ClassTemplate || TPC == TPC_FriendClassTemplate || - TPC == TPC_VarTemplate || TPC == TPC_TypeAliasTemplate)) { + (TPC == TPC_Other || TPC == TPC_TemplateTemplateParameterPack || + TPC == TPC_FriendClassTemplate)) { // C++ 23[temp.param]p14: // If a template-parameter of a class template, variable template, or // alias template has a default template argument, each subsequent diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 137942f0c30bf..fea7225e11134 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3429,9 +3429,9 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( if (!P.isPackExpansion() && !A.isPackExpansion()) { Info.Param = makeTemplateParameter(Template->getTemplateParameters()->getParam( - (PsStack.empty() ? TemplateArgs.end() - : PsStack.front().begin()) - - TemplateArgs.begin())); + (AsStack.empty() ? 
CTAI.CanonicalConverted.end() + : AsStack.front().begin()) - + 1 - CTAI.CanonicalConverted.begin())); Info.FirstArg = P; Info.SecondArg = A; return TemplateDeductionResult::NonDeducedMismatch; @@ -6642,17 +6642,19 @@ bool Sema::isTemplateTemplateParameterAtLeastAsSpecializedAs( TemplateDeductionResult TDK; runWithSufficientStackSpace(Info.getLocation(), [&] { - TDK = ::FinishTemplateArgumentDeduction( - *this, AArg, /*IsPartialOrdering=*/true, PArgs, Deduced, Info); + TDK = ::FinishTemplateArgumentDeduction(*this, AArg, PartialOrdering, PArgs, + Deduced, Info); }); switch (TDK) { case TemplateDeductionResult::Success: return true; // It doesn't seem possible to get a non-deduced mismatch when partial - // ordering TTPs. + // ordering TTPs, except with an invalid template parameter list which has + // a parameter after a pack. case TemplateDeductionResult::NonDeducedMismatch: - llvm_unreachable("Unexpected NonDeducedMismatch"); + assert(PArg->isInvalidDecl() && "Unexpected NonDeducedMismatch"); + return false; // Substitution failures should have already been diagnosed. case TemplateDeductionResult::AlreadyDiagnosed: diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 131f5c8ad1a09..b5c9dbad2a590 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -1826,7 +1826,7 @@ Decl *TemplateDeclInstantiator::VisitClassTemplateDecl(ClassTemplateDecl *D) { // Do some additional validation, then merge default arguments // from the existing declarations. 
if (SemaRef.CheckTemplateParameterList(InstParams, PrevParams, - Sema::TPC_ClassTemplate)) + Sema::TPC_Other)) return nullptr; Inst->setAccess(PrevClassTemplate->getAccess()); diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c new file mode 100644 index 0000000000000..20e389d10c80f --- /dev/null +++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5 +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s + +// Test that, depending on triple and, if applicable, target-cpu, one of three +// things happens: +// 1) for gfx900 we emit an empty kernel (concrete target, lacks feature) +// 2) for gfx1010 we emit a call to trap (concrete target, has feature) +// 3) for AMDGCNSPIRV we emit llvm.amdgcn.has.gfx10-insts as a constant +// externally initialised bool global, and load from it to provide the +// condition to a br (abstract target) + +//. +// AMDGCN-GFX900: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600 +//. +// AMDGCN-GFX1010: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600 +//. +// AMDGCNSPIRV: @llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1 +//. 
+// AMDGCN-GFX900-LABEL: define dso_local void @foo( +// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX900-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX900-NEXT: ret void +// +// AMDGCN-GFX1010-LABEL: define dso_local void @foo( +// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX1010-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX1010-NEXT: call void @llvm.trap() +// AMDGCN-GFX1010-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @foo( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1 +// AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// AMDGCNSPIRV: [[IF_THEN]]: +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap() +// AMDGCNSPIRV-NEXT: br label %[[IF_END]] +// AMDGCNSPIRV: [[IF_END]]: +// AMDGCNSPIRV-NEXT: ret void +// +void foo() { + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16)) + return __builtin_trap(); +} +//. +// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +//. +// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } +// AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. 
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" } +// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. 
diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c new file mode 100644 index 0000000000000..fae8f12f5c96b --- /dev/null +++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c @@ -0,0 +1,66 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5 +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s + +// Test that, depending on triple and, if applicable, target-cpu, one of three +// things happens: +// 1) for gfx900 we emit a call to trap (concrete target, matches) +// 2) for gfx1010 we emit an empty kernel (concrete target, does not match) +// 3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and +// load from it to provide the condition a br (abstract target) +//. +// AMDGCN-GFX900: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600 +//. +// AMDGCN-GFX1010: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600 +//. +// AMDGCNSPIRV: @llvm.amdgcn.is.gfx900 = external addrspace(1) externally_initialized constant i1 +//. 
+// AMDGCN-GFX900-LABEL: define dso_local void @foo( +// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX900-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX900-NEXT: call void @llvm.trap() +// AMDGCN-GFX900-NEXT: ret void +// +// AMDGCN-GFX1010-LABEL: define dso_local void @foo( +// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX1010-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX1010-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @foo( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1 +// AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// AMDGCNSPIRV: [[IF_THEN]]: +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap() +// AMDGCNSPIRV-NEXT: br label %[[IF_END]] +// AMDGCNSPIRV: [[IF_END]]: +// AMDGCNSPIRV-NEXT: ret void +// +void foo() { + if (__builtin_amdgcn_processor_is("gfx900")) + return __builtin_trap(); +} +//. +// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +// AMDGCN-GFX900: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } +//. 
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" } +// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. 
diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp new file mode 100644 index 0000000000000..10cfdab1b37cd --- /dev/null +++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp @@ -0,0 +1,48 @@ +// RUN: not %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - 2>&1 | FileCheck %s +// RUN: not %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - 2>&1 | FileCheck %s + +bool predicate(bool x); +void pass_by_value(__amdgpu_feature_predicate_t x); + +void invalid_uses(int *p, int x, const __amdgpu_feature_predicate_t &lv, + __amdgpu_feature_predicate_t &&rv) { + // CHECK: error: 'a' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t a; + // CHECK: error: 'b' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t b = __builtin_amdgcn_processor_is("gfx906"); + // CHECK: error: 'c' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t c = lv; + // CHECK: error: 'd' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t d = rv; + // CHECK: error: __builtin_amdgcn_processor_is("gfx906") must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + bool invalid_use_in_init_0 = __builtin_amdgcn_processor_is("gfx906"); + // CHECK: error: 'x' has type __amdgpu_feature_predicate_t, which is not constructible + pass_by_value(__builtin_amdgcn_processor_is("gfx906")); + // CHECK: error: __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var) must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + bool invalid_use_in_init_1 = 
__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + // CHECK: error: __builtin_amdgcn_processor_is("gfx906") must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + if (bool invalid_use_in_init_2 = __builtin_amdgcn_processor_is("gfx906")) return; + // CHECK: error: __builtin_amdgcn_processor_is("gfx1200") must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + if (predicate(__builtin_amdgcn_processor_is("gfx1200"))) __builtin_amdgcn_s_sleep_var(x); +} + +void invalid_invocations(int x, const char* str) { + // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a valid AMDGCN processor identifier; 'not_an_amdgcn_gfx_id' is not valid + // CHECK-DAG: note: valid AMDGCN processor identifiers are: {{.*}}gfx{{.*}} + if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return; + // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal + if (__builtin_amdgcn_processor_is(str)) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}__builtin_amdgcn_s_sleep_var{{.*}} is not valid + if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}str{{.*}} is not valid + else if (__builtin_amdgcn_is_invocable(str)) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}x{{.*}} is not valid + else if (__builtin_amdgcn_is_invocable(x)) return; + // CHECK: error: use of undeclared identifier 
'__builtin_ia32_pause' + else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return; +} + +bool return_needs_cast() { + // CHECK: error: __builtin_amdgcn_processor_is("gfx900") must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + return __builtin_amdgcn_processor_is("gfx900"); +} diff --git a/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c b/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c index 447590dcc3804..7294d4c96e76c 100644 --- a/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c +++ b/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c @@ -1,6 +1,5 @@ // RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -fcuda-is-device -fembed-bitcode=marker -x hip %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK -// XFAIL: * // CHECK: @llvm.embedded.module = private addrspace(1) constant [0 x i8] zeroinitializer, section ".llvmbc", align 1 // CHECK-NEXT: @llvm.cmdline = private addrspace(1) constant [{{[0-9]+}} x i8] c"{{.*}}", section ".llvmcmd", align 1 // CHECK-NEXT: @llvm.compiler.used = appending addrspace(1) global [5 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) @foo.managed to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @foo to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.cmdline to ptr addrspace(4))], section "llvm.metadata" diff --git a/clang/test/CodeGen/link-builtin-bitcode.c b/clang/test/CodeGen/link-builtin-bitcode.c index 470180efa4247..963a3956ff808 100644 --- a/clang/test/CodeGen/link-builtin-bitcode.c +++ b/clang/test/CodeGen/link-builtin-bitcode.c @@ -44,6 +44,6 @@ int bar() { return no_attr() + attr_in_target() + 
attr_not_in_target() + attr_in // CHECK-SAME: () #[[ATTR_INCOMPATIBLE:[0-9]+]] { // CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } -// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } -// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } -// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-gfx9-insts" } +// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64" } +// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64" } +// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts" } diff --git a/clang/test/CodeGen/logb_scalbn.c b/clang/test/CodeGen/logb_scalbn.c new file mode 100644 index 0000000000000..be5e68b5fd4b0 --- /dev/null +++ b/clang/test/CodeGen/logb_scalbn.c @@ -0,0 +1,1045 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang -cc1 -triple amdgcn-amd-amdhsa -o - -emit-llvm %s | FileCheck %s -check-prefixes=DEFAULT +// RUN: %clang -cc1 -triple amdgcn-amd-amdhsa -o - -ffp-exception-behavior=ignore -emit-llvm %s | FileCheck %s -check-prefixes=IGNORE +// RUN: %clang -cc1 -triple amdgcn-amd-amdhsa -o - -ffp-exception-behavior=strict -emit-llvm %s | FileCheck %s -check-prefixes=STRICT +// RUN: %clang -cc1 -triple amdgcn-amd-amdhsa -o - 
-ffp-exception-behavior=maytrap -emit-llvm %s | FileCheck %s -check-prefixes=MAYTRAP +// RUN: %clang -cc1 -triple amdgcn-amd-amdhsa -o - -fmath-errno -emit-llvm %s | FileCheck %s -check-prefixes=ERRNO + +// DEFAULT-LABEL: define dso_local void @test_logbf( +// DEFAULT-SAME: ) #[[ATTR0:[0-9]+]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// DEFAULT-NEXT: [[TMP0:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 0x40301999A0000000) +// DEFAULT-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// DEFAULT-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// DEFAULT-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float +// DEFAULT-NEXT: [[TMP4:%.*]] = call float @llvm.fabs.f32(float 0x40301999A0000000) +// DEFAULT-NEXT: [[TMP5:%.*]] = fcmp one float [[TMP4]], 0x7FF0000000000000 +// DEFAULT-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]] +// DEFAULT-NEXT: [[TMP7:%.*]] = select i1 false, float 0xFFF0000000000000, float [[TMP6]] +// DEFAULT-NEXT: store float [[TMP7]], ptr [[D1_ASCAST]], align 4 +// DEFAULT-NEXT: ret void +// +// IGNORE-LABEL: define dso_local void @test_logbf( +// IGNORE-SAME: ) #[[ATTR0:[0-9]+]] { +// IGNORE-NEXT: [[ENTRY:.*:]] +// IGNORE-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// IGNORE-NEXT: [[TMP0:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 0x40301999A0000000) +// IGNORE-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// IGNORE-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// IGNORE-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float +// IGNORE-NEXT: [[TMP4:%.*]] = call float @llvm.fabs.f32(float 0x40301999A0000000) +// IGNORE-NEXT: [[TMP5:%.*]] = fcmp one float [[TMP4]], 0x7FF0000000000000 +// IGNORE-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float 
[[TMP4]] +// IGNORE-NEXT: [[TMP7:%.*]] = select i1 false, float 0xFFF0000000000000, float [[TMP6]] +// IGNORE-NEXT: store float [[TMP7]], ptr [[D1_ASCAST]], align 4 +// IGNORE-NEXT: ret void +// +// STRICT-LABEL: define dso_local void @test_logbf( +// STRICT-SAME: ) #[[ATTR0:[0-9]+]] { +// STRICT-NEXT: [[ENTRY:.*:]] +// STRICT-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// STRICT-NEXT: [[TMP0:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 0x40301999A0000000) +// STRICT-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// STRICT-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// STRICT-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float +// STRICT-NEXT: [[TMP4:%.*]] = call float @llvm.fabs.f32(float 0x40301999A0000000) +// STRICT-NEXT: [[TMP5:%.*]] = fcmp one float [[TMP4]], 0x7FF0000000000000 +// STRICT-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]] +// STRICT-NEXT: [[TMP7:%.*]] = select i1 false, float 0xFFF0000000000000, float [[TMP6]] +// STRICT-NEXT: store float [[TMP7]], ptr [[D1_ASCAST]], align 4 +// STRICT-NEXT: ret void +// +// MAYTRAP-LABEL: define dso_local void @test_logbf( +// MAYTRAP-SAME: ) #[[ATTR0:[0-9]+]] { +// MAYTRAP-NEXT: [[ENTRY:.*:]] +// MAYTRAP-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// MAYTRAP-NEXT: [[TMP0:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 0x40301999A0000000) +// MAYTRAP-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// MAYTRAP-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// MAYTRAP-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float +// MAYTRAP-NEXT: [[TMP4:%.*]] = call float @llvm.fabs.f32(float 0x40301999A0000000) +// MAYTRAP-NEXT: [[TMP5:%.*]] = fcmp one float [[TMP4]], 0x7FF0000000000000 +// MAYTRAP-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]] 
+// MAYTRAP-NEXT: [[TMP7:%.*]] = select i1 false, float 0xFFF0000000000000, float [[TMP6]] +// MAYTRAP-NEXT: store float [[TMP7]], ptr [[D1_ASCAST]], align 4 +// MAYTRAP-NEXT: ret void +// +// ERRNO-LABEL: define dso_local void @test_logbf( +// ERRNO-SAME: ) #[[ATTR0:[0-9]+]] { +// ERRNO-NEXT: [[ENTRY:.*:]] +// ERRNO-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// ERRNO-NEXT: [[CALL:%.*]] = call float @logbf(float noundef 0x40301999A0000000) #[[ATTR2:[0-9]+]] +// ERRNO-NEXT: store float [[CALL]], ptr [[D1_ASCAST]], align 4 +// ERRNO-NEXT: ret void +// +void test_logbf() { + float D1 = __builtin_logbf(16.1f); +} +// DEFAULT-LABEL: define dso_local void @test_logbf_var( +// DEFAULT-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// DEFAULT-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// DEFAULT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// DEFAULT-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP0]]) +// DEFAULT-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +// DEFAULT-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// DEFAULT-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to float +// DEFAULT-NEXT: [[TMP5:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP6:%.*]] = call float @llvm.fabs.f32(float [[TMP5]]) +// DEFAULT-NEXT: [[TMP7:%.*]] = fcmp one float [[TMP6]], 0x7FF0000000000000 +// DEFAULT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP4]], float [[TMP6]] +// DEFAULT-NEXT: [[TMP9:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00 +// DEFAULT-NEXT: 
[[TMP10:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[TMP8]] +// DEFAULT-NEXT: store float [[TMP10]], ptr [[D1_ASCAST]], align 4 +// DEFAULT-NEXT: ret void +// +// IGNORE-LABEL: define dso_local void @test_logbf_var( +// IGNORE-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// IGNORE-NEXT: [[ENTRY:.*:]] +// IGNORE-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// IGNORE-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// IGNORE-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// IGNORE-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP0]]) +// IGNORE-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +// IGNORE-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// IGNORE-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to float +// IGNORE-NEXT: [[TMP5:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP6:%.*]] = call float @llvm.fabs.f32(float [[TMP5]]) +// IGNORE-NEXT: [[TMP7:%.*]] = fcmp one float [[TMP6]], 0x7FF0000000000000 +// IGNORE-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP4]], float [[TMP6]] +// IGNORE-NEXT: [[TMP9:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00 +// IGNORE-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[TMP8]] +// IGNORE-NEXT: store float [[TMP10]], ptr [[D1_ASCAST]], align 4 +// IGNORE-NEXT: ret void +// +// STRICT-LABEL: define dso_local void @test_logbf_var( +// STRICT-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// STRICT-NEXT: [[ENTRY:.*:]] +// STRICT-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// STRICT-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// STRICT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// 
STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// STRICT-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP0]]) +// STRICT-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +// STRICT-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// STRICT-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to float +// STRICT-NEXT: [[TMP5:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP6:%.*]] = call float @llvm.fabs.f32(float [[TMP5]]) +// STRICT-NEXT: [[TMP7:%.*]] = fcmp one float [[TMP6]], 0x7FF0000000000000 +// STRICT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP4]], float [[TMP6]] +// STRICT-NEXT: [[TMP9:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00 +// STRICT-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[TMP8]] +// STRICT-NEXT: store float [[TMP10]], ptr [[D1_ASCAST]], align 4 +// STRICT-NEXT: ret void +// +// MAYTRAP-LABEL: define dso_local void @test_logbf_var( +// MAYTRAP-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// MAYTRAP-NEXT: [[ENTRY:.*:]] +// MAYTRAP-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// MAYTRAP-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// MAYTRAP-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// MAYTRAP-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP0]]) +// MAYTRAP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +// MAYTRAP-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// MAYTRAP-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to float +// MAYTRAP-NEXT: [[TMP5:%.*]] = load float, ptr 
[[A_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP6:%.*]] = call float @llvm.fabs.f32(float [[TMP5]]) +// MAYTRAP-NEXT: [[TMP7:%.*]] = fcmp one float [[TMP6]], 0x7FF0000000000000 +// MAYTRAP-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP4]], float [[TMP6]] +// MAYTRAP-NEXT: [[TMP9:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00 +// MAYTRAP-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[TMP8]] +// MAYTRAP-NEXT: store float [[TMP10]], ptr [[D1_ASCAST]], align 4 +// MAYTRAP-NEXT: ret void +// +// ERRNO-LABEL: define dso_local void @test_logbf_var( +// ERRNO-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// ERRNO-NEXT: [[ENTRY:.*:]] +// ERRNO-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// ERRNO-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// ERRNO-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// ERRNO-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[CALL:%.*]] = call float @logbf(float noundef [[TMP0]]) #[[ATTR2]] +// ERRNO-NEXT: store float [[CALL]], ptr [[D1_ASCAST]], align 4 +// ERRNO-NEXT: ret void +// +void test_logbf_var(float a) { + float D1 = __builtin_logbf(a); +} +// CHECK-LABEL: define dso_local void @test_logb( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// CHECK-NEXT: [[TMP0:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double 1.510000e+01) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// CHECK-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to double +// CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fabs.f64(double 1.510000e+01) +// CHECK-NEXT: 
[[TMP5:%.*]] = fcmp one double [[TMP4]], 0x7FF0000000000000 +// CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP4]] +// CHECK-NEXT: [[TMP7:%.*]] = select i1 false, double 0xFFF0000000000000, double [[TMP6]] +// CHECK-NEXT: store double [[TMP7]], ptr [[D1_ASCAST]], align 8 +// CHECK-NEXT: ret void +// DEFAULT-LABEL: define dso_local void @test_logb( +// DEFAULT-SAME: ) #[[ATTR0]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// DEFAULT-NEXT: [[TMP0:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double 1.510000e+01) +// DEFAULT-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// DEFAULT-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// DEFAULT-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to double +// DEFAULT-NEXT: [[TMP4:%.*]] = call double @llvm.fabs.f64(double 1.510000e+01) +// DEFAULT-NEXT: [[TMP5:%.*]] = fcmp one double [[TMP4]], 0x7FF0000000000000 +// DEFAULT-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP4]] +// DEFAULT-NEXT: [[TMP7:%.*]] = select i1 false, double 0xFFF0000000000000, double [[TMP6]] +// DEFAULT-NEXT: store double [[TMP7]], ptr [[D1_ASCAST]], align 8 +// DEFAULT-NEXT: ret void +// +// IGNORE-LABEL: define dso_local void @test_logb( +// IGNORE-SAME: ) #[[ATTR0]] { +// IGNORE-NEXT: [[ENTRY:.*:]] +// IGNORE-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// IGNORE-NEXT: [[TMP0:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double 1.510000e+01) +// IGNORE-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// IGNORE-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// IGNORE-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to double +// IGNORE-NEXT: [[TMP4:%.*]] = call double @llvm.fabs.f64(double 1.510000e+01) +// IGNORE-NEXT: [[TMP5:%.*]] = fcmp one double 
[[TMP4]], 0x7FF0000000000000 +// IGNORE-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP4]] +// IGNORE-NEXT: [[TMP7:%.*]] = select i1 false, double 0xFFF0000000000000, double [[TMP6]] +// IGNORE-NEXT: store double [[TMP7]], ptr [[D1_ASCAST]], align 8 +// IGNORE-NEXT: ret void +// +// STRICT-LABEL: define dso_local void @test_logb( +// STRICT-SAME: ) #[[ATTR0]] { +// STRICT-NEXT: [[ENTRY:.*:]] +// STRICT-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// STRICT-NEXT: [[TMP0:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double 1.510000e+01) +// STRICT-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// STRICT-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// STRICT-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to double +// STRICT-NEXT: [[TMP4:%.*]] = call double @llvm.fabs.f64(double 1.510000e+01) +// STRICT-NEXT: [[TMP5:%.*]] = fcmp one double [[TMP4]], 0x7FF0000000000000 +// STRICT-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP4]] +// STRICT-NEXT: [[TMP7:%.*]] = select i1 false, double 0xFFF0000000000000, double [[TMP6]] +// STRICT-NEXT: store double [[TMP7]], ptr [[D1_ASCAST]], align 8 +// STRICT-NEXT: ret void +// +// MAYTRAP-LABEL: define dso_local void @test_logb( +// MAYTRAP-SAME: ) #[[ATTR0]] { +// MAYTRAP-NEXT: [[ENTRY:.*:]] +// MAYTRAP-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// MAYTRAP-NEXT: [[TMP0:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double 1.510000e+01) +// MAYTRAP-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// MAYTRAP-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], -1 +// MAYTRAP-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to double +// MAYTRAP-NEXT: [[TMP4:%.*]] = call double @llvm.fabs.f64(double 1.510000e+01) +// MAYTRAP-NEXT: [[TMP5:%.*]] = fcmp one double [[TMP4]], 
0x7FF0000000000000 +// MAYTRAP-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP4]] +// MAYTRAP-NEXT: [[TMP7:%.*]] = select i1 false, double 0xFFF0000000000000, double [[TMP6]] +// MAYTRAP-NEXT: store double [[TMP7]], ptr [[D1_ASCAST]], align 8 +// MAYTRAP-NEXT: ret void +// +// ERRNO-LABEL: define dso_local void @test_logb( +// ERRNO-SAME: ) #[[ATTR0]] { +// ERRNO-NEXT: [[ENTRY:.*:]] +// ERRNO-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// ERRNO-NEXT: [[CALL:%.*]] = call double @logb(double noundef 1.510000e+01) #[[ATTR2]] +// ERRNO-NEXT: store double [[CALL]], ptr [[D1_ASCAST]], align 8 +// ERRNO-NEXT: ret void +// +void test_logb() { + double D1 = __builtin_logb(15.1); +} +// CHECK-LABEL: define dso_local void @test_logb_var( +// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// CHECK-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP1]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// CHECK-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to double +// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fabs.f64(double [[TMP5]]) +// CHECK-NEXT: [[TMP7:%.*]] = fcmp one double [[TMP6]], 0x7FF0000000000000 +// CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP6]] +// CHECK-NEXT: 
[[TMP9:%.*]] = fcmp oeq double [[TMP0]], 0.000000e+00 +// CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[TMP8]] +// CHECK-NEXT: store double [[TMP10]], ptr [[D1_ASCAST]], align 8 +// CHECK-NEXT: ret void +// DEFAULT-LABEL: define dso_local void @test_logb_var( +// DEFAULT-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// DEFAULT-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// DEFAULT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// DEFAULT-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// DEFAULT-NEXT: [[TMP1:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[TMP0]]) +// DEFAULT-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP1]], 1 +// DEFAULT-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// DEFAULT-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to double +// DEFAULT-NEXT: [[TMP5:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// DEFAULT-NEXT: [[TMP6:%.*]] = call double @llvm.fabs.f64(double [[TMP5]]) +// DEFAULT-NEXT: [[TMP7:%.*]] = fcmp one double [[TMP6]], 0x7FF0000000000000 +// DEFAULT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP6]] +// DEFAULT-NEXT: [[TMP9:%.*]] = fcmp oeq double [[TMP0]], 0.000000e+00 +// DEFAULT-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[TMP8]] +// DEFAULT-NEXT: store double [[TMP10]], ptr [[D1_ASCAST]], align 8 +// DEFAULT-NEXT: ret void +// +// IGNORE-LABEL: define dso_local void @test_logb_var( +// IGNORE-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// IGNORE-NEXT: [[ENTRY:.*:]] +// IGNORE-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// IGNORE-NEXT: [[D1:%.*]] = alloca double, align 8, 
addrspace(5) +// IGNORE-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// IGNORE-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// IGNORE-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// IGNORE-NEXT: [[TMP1:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[TMP0]]) +// IGNORE-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP1]], 1 +// IGNORE-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// IGNORE-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to double +// IGNORE-NEXT: [[TMP5:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// IGNORE-NEXT: [[TMP6:%.*]] = call double @llvm.fabs.f64(double [[TMP5]]) +// IGNORE-NEXT: [[TMP7:%.*]] = fcmp one double [[TMP6]], 0x7FF0000000000000 +// IGNORE-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP6]] +// IGNORE-NEXT: [[TMP9:%.*]] = fcmp oeq double [[TMP0]], 0.000000e+00 +// IGNORE-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[TMP8]] +// IGNORE-NEXT: store double [[TMP10]], ptr [[D1_ASCAST]], align 8 +// IGNORE-NEXT: ret void +// +// STRICT-LABEL: define dso_local void @test_logb_var( +// STRICT-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// STRICT-NEXT: [[ENTRY:.*:]] +// STRICT-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// STRICT-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// STRICT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// STRICT-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// STRICT-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// STRICT-NEXT: [[TMP1:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[TMP0]]) +// STRICT-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP1]], 1 +// STRICT-NEXT: [[TMP3:%.*]] = add nsw i32 
[[TMP2]], -1 +// STRICT-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to double +// STRICT-NEXT: [[TMP5:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// STRICT-NEXT: [[TMP6:%.*]] = call double @llvm.fabs.f64(double [[TMP5]]) +// STRICT-NEXT: [[TMP7:%.*]] = fcmp one double [[TMP6]], 0x7FF0000000000000 +// STRICT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP6]] +// STRICT-NEXT: [[TMP9:%.*]] = fcmp oeq double [[TMP0]], 0.000000e+00 +// STRICT-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[TMP8]] +// STRICT-NEXT: store double [[TMP10]], ptr [[D1_ASCAST]], align 8 +// STRICT-NEXT: ret void +// +// MAYTRAP-LABEL: define dso_local void @test_logb_var( +// MAYTRAP-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// MAYTRAP-NEXT: [[ENTRY:.*:]] +// MAYTRAP-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// MAYTRAP-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// MAYTRAP-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// MAYTRAP-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// MAYTRAP-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// MAYTRAP-NEXT: [[TMP1:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[TMP0]]) +// MAYTRAP-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP1]], 1 +// MAYTRAP-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], -1 +// MAYTRAP-NEXT: [[TMP4:%.*]] = sitofp i32 [[TMP3]] to double +// MAYTRAP-NEXT: [[TMP5:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// MAYTRAP-NEXT: [[TMP6:%.*]] = call double @llvm.fabs.f64(double [[TMP5]]) +// MAYTRAP-NEXT: [[TMP7:%.*]] = fcmp one double [[TMP6]], 0x7FF0000000000000 +// MAYTRAP-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP6]] +// MAYTRAP-NEXT: [[TMP9:%.*]] = fcmp oeq double [[TMP0]], 0.000000e+00 +// MAYTRAP-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], double 
0xFFF0000000000000, double [[TMP8]] +// MAYTRAP-NEXT: store double [[TMP10]], ptr [[D1_ASCAST]], align 8 +// MAYTRAP-NEXT: ret void +// +// ERRNO-LABEL: define dso_local void @test_logb_var( +// ERRNO-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// ERRNO-NEXT: [[ENTRY:.*:]] +// ERRNO-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// ERRNO-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// ERRNO-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// ERRNO-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// ERRNO-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// ERRNO-NEXT: [[CALL:%.*]] = call double @logb(double noundef [[TMP0]]) #[[ATTR2]] +// ERRNO-NEXT: store double [[CALL]], ptr [[D1_ASCAST]], align 8 +// ERRNO-NEXT: ret void +// +void test_logb_var(double a) { + double D1 = __builtin_logb(a); +} + +// CHECK-LABEL: define dso_local void @test_scalbnf( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.ldexp.f32.i32(float 0x4030B33340000000, i32 10) +// CHECK-NEXT: store float [[TMP0]], ptr [[D1_ASCAST]], align 4 +// CHECK-NEXT: ret void +// DEFAULT-LABEL: define dso_local void @test_scalbnf( +// DEFAULT-SAME: ) #[[ATTR0]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// DEFAULT-NEXT: [[TMP0:%.*]] = call float @llvm.ldexp.f32.i32(float 0x4030B33340000000, i32 10) +// DEFAULT-NEXT: store float [[TMP0]], ptr [[D1_ASCAST]], align 4 +// DEFAULT-NEXT: ret void +// +// IGNORE-LABEL: define dso_local void @test_scalbnf( +// IGNORE-SAME: ) #[[ATTR0]] { +// IGNORE-NEXT: 
[[ENTRY:.*:]] +// IGNORE-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// IGNORE-NEXT: [[TMP0:%.*]] = call float @llvm.ldexp.f32.i32(float 0x4030B33340000000, i32 10) +// IGNORE-NEXT: store float [[TMP0]], ptr [[D1_ASCAST]], align 4 +// IGNORE-NEXT: ret void +// +// STRICT-LABEL: define dso_local void @test_scalbnf( +// STRICT-SAME: ) #[[ATTR0]] { +// STRICT-NEXT: [[ENTRY:.*:]] +// STRICT-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// STRICT-NEXT: [[TMP0:%.*]] = call float @llvm.ldexp.f32.i32(float 0x4030B33340000000, i32 10) +// STRICT-NEXT: store float [[TMP0]], ptr [[D1_ASCAST]], align 4 +// STRICT-NEXT: ret void +// +// MAYTRAP-LABEL: define dso_local void @test_scalbnf( +// MAYTRAP-SAME: ) #[[ATTR0]] { +// MAYTRAP-NEXT: [[ENTRY:.*:]] +// MAYTRAP-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// MAYTRAP-NEXT: [[TMP0:%.*]] = call float @llvm.ldexp.f32.i32(float 0x4030B33340000000, i32 10) +// MAYTRAP-NEXT: store float [[TMP0]], ptr [[D1_ASCAST]], align 4 +// MAYTRAP-NEXT: ret void +// +// ERRNO-LABEL: define dso_local void @test_scalbnf( +// ERRNO-SAME: ) #[[ATTR0]] { +// ERRNO-NEXT: [[ENTRY:.*:]] +// ERRNO-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// ERRNO-NEXT: [[CALL:%.*]] = call float @scalbnf(float noundef 0x4030B33340000000, i32 noundef 10) #[[ATTR2]] +// ERRNO-NEXT: store float [[CALL]], ptr [[D1_ASCAST]], align 4 +// ERRNO-NEXT: ret void +// +void test_scalbnf() { + float D1 = __builtin_scalbnf(16.7f, 10); +} +// CHECK-LABEL: define dso_local void @test_scalbnf_var1( +// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca 
float, align 4, addrspace(5) +// CHECK-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// CHECK-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP0]], i32 9) +// CHECK-NEXT: store float [[TMP1]], ptr [[D1_ASCAST]], align 4 +// CHECK-NEXT: ret void +// DEFAULT-LABEL: define dso_local void @test_scalbnf_var1( +// DEFAULT-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// DEFAULT-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// DEFAULT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// DEFAULT-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP0]], i32 9) +// DEFAULT-NEXT: store float [[TMP1]], ptr [[D1_ASCAST]], align 4 +// DEFAULT-NEXT: ret void +// +// IGNORE-LABEL: define dso_local void @test_scalbnf_var1( +// IGNORE-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// IGNORE-NEXT: [[ENTRY:.*:]] +// IGNORE-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// IGNORE-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// IGNORE-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// IGNORE-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP1:%.*]] = 
call float @llvm.ldexp.f32.i32(float [[TMP0]], i32 9) +// IGNORE-NEXT: store float [[TMP1]], ptr [[D1_ASCAST]], align 4 +// IGNORE-NEXT: ret void +// +// STRICT-LABEL: define dso_local void @test_scalbnf_var1( +// STRICT-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// STRICT-NEXT: [[ENTRY:.*:]] +// STRICT-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// STRICT-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// STRICT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// STRICT-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP0]], i32 9) +// STRICT-NEXT: store float [[TMP1]], ptr [[D1_ASCAST]], align 4 +// STRICT-NEXT: ret void +// +// MAYTRAP-LABEL: define dso_local void @test_scalbnf_var1( +// MAYTRAP-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// MAYTRAP-NEXT: [[ENTRY:.*:]] +// MAYTRAP-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// MAYTRAP-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// MAYTRAP-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// MAYTRAP-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP0]], i32 9) +// MAYTRAP-NEXT: store float [[TMP1]], ptr [[D1_ASCAST]], align 4 +// MAYTRAP-NEXT: ret void +// +// ERRNO-LABEL: define dso_local void @test_scalbnf_var1( +// ERRNO-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// ERRNO-NEXT: [[ENTRY:.*:]] +// ERRNO-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// ERRNO-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// 
ERRNO-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// ERRNO-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[CALL:%.*]] = call float @scalbnf(float noundef [[TMP0]], i32 noundef 9) #[[ATTR2]] +// ERRNO-NEXT: store float [[CALL]], ptr [[D1_ASCAST]], align 4 +// ERRNO-NEXT: ret void +// +void test_scalbnf_var1(float a) { + float D1 = __builtin_scalbnf(a, 9); +} +// CHECK-LABEL: define dso_local void @test_scalbnf_var2( +// CHECK-SAME: i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// CHECK-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.ldexp.f32.i32(float 0x402E666660000000, i32 [[TMP0]]) +// CHECK-NEXT: store float [[TMP1]], ptr [[D1_ASCAST]], align 4 +// CHECK-NEXT: ret void +// DEFAULT-LABEL: define dso_local void @test_scalbnf_var2( +// DEFAULT-SAME: i32 noundef [[B:%.*]]) #[[ATTR0]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// DEFAULT-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// DEFAULT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// DEFAULT-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP1:%.*]] = call float 
@llvm.ldexp.f32.i32(float 0x402E666660000000, i32 [[TMP0]]) +// DEFAULT-NEXT: store float [[TMP1]], ptr [[D1_ASCAST]], align 4 +// DEFAULT-NEXT: ret void +// +// IGNORE-LABEL: define dso_local void @test_scalbnf_var2( +// IGNORE-SAME: i32 noundef [[B:%.*]]) #[[ATTR0]] { +// IGNORE-NEXT: [[ENTRY:.*:]] +// IGNORE-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// IGNORE-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// IGNORE-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// IGNORE-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP0:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP1:%.*]] = call float @llvm.ldexp.f32.i32(float 0x402E666660000000, i32 [[TMP0]]) +// IGNORE-NEXT: store float [[TMP1]], ptr [[D1_ASCAST]], align 4 +// IGNORE-NEXT: ret void +// +// STRICT-LABEL: define dso_local void @test_scalbnf_var2( +// STRICT-SAME: i32 noundef [[B:%.*]]) #[[ATTR0]] { +// STRICT-NEXT: [[ENTRY:.*:]] +// STRICT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// STRICT-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// STRICT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// STRICT-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP0:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP1:%.*]] = call float @llvm.ldexp.f32.i32(float 0x402E666660000000, i32 [[TMP0]]) +// STRICT-NEXT: store float [[TMP1]], ptr [[D1_ASCAST]], align 4 +// STRICT-NEXT: ret void +// +// MAYTRAP-LABEL: define dso_local void @test_scalbnf_var2( +// MAYTRAP-SAME: i32 noundef [[B:%.*]]) #[[ATTR0]] { +// MAYTRAP-NEXT: [[ENTRY:.*:]] +// MAYTRAP-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// MAYTRAP-NEXT: [[D1:%.*]] = alloca float, align 
4, addrspace(5) +// MAYTRAP-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// MAYTRAP-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP0:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP1:%.*]] = call float @llvm.ldexp.f32.i32(float 0x402E666660000000, i32 [[TMP0]]) +// MAYTRAP-NEXT: store float [[TMP1]], ptr [[D1_ASCAST]], align 4 +// MAYTRAP-NEXT: ret void +// +// ERRNO-LABEL: define dso_local void @test_scalbnf_var2( +// ERRNO-SAME: i32 noundef [[B:%.*]]) #[[ATTR0]] { +// ERRNO-NEXT: [[ENTRY:.*:]] +// ERRNO-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// ERRNO-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// ERRNO-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// ERRNO-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[TMP0:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[CALL:%.*]] = call float @scalbnf(float noundef 0x402E666660000000, i32 noundef [[TMP0]]) #[[ATTR2]] +// ERRNO-NEXT: store float [[CALL]], ptr [[D1_ASCAST]], align 4 +// ERRNO-NEXT: ret void +// +void test_scalbnf_var2(int b) { + float D1 = __builtin_scalbnf(15.2f, b); +} +// CHECK-LABEL: define dso_local void @test_scalbnf_var3( +// CHECK-SAME: float noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[D1]] to ptr +// CHECK-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP0]], i32 [[TMP1]]) +// CHECK-NEXT: store float [[TMP2]], ptr [[D1_ASCAST]], align 4 +// CHECK-NEXT: ret void +// DEFAULT-LABEL: define dso_local void @test_scalbnf_var3( +// DEFAULT-SAME: float noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// DEFAULT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// DEFAULT-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// DEFAULT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// DEFAULT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// DEFAULT-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP2:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP0]], i32 [[TMP1]]) +// DEFAULT-NEXT: store float [[TMP2]], ptr [[D1_ASCAST]], align 4 +// DEFAULT-NEXT: ret void +// +// IGNORE-LABEL: define dso_local void @test_scalbnf_var3( +// IGNORE-SAME: float noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// IGNORE-NEXT: [[ENTRY:.*:]] +// IGNORE-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// IGNORE-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// IGNORE-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// IGNORE-NEXT: 
[[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// IGNORE-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// IGNORE-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP2:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP0]], i32 [[TMP1]]) +// IGNORE-NEXT: store float [[TMP2]], ptr [[D1_ASCAST]], align 4 +// IGNORE-NEXT: ret void +// +// STRICT-LABEL: define dso_local void @test_scalbnf_var3( +// STRICT-SAME: float noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// STRICT-NEXT: [[ENTRY:.*:]] +// STRICT-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// STRICT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// STRICT-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// STRICT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// STRICT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// STRICT-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// STRICT-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP2:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP0]], i32 [[TMP1]]) +// STRICT-NEXT: store float [[TMP2]], ptr [[D1_ASCAST]], align 4 +// STRICT-NEXT: ret void +// +// MAYTRAP-LABEL: define dso_local void @test_scalbnf_var3( +// MAYTRAP-SAME: float noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// MAYTRAP-NEXT: [[ENTRY:.*:]] +// 
MAYTRAP-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// MAYTRAP-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// MAYTRAP-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// MAYTRAP-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// MAYTRAP-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// MAYTRAP-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP2:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP0]], i32 [[TMP1]]) +// MAYTRAP-NEXT: store float [[TMP2]], ptr [[D1_ASCAST]], align 4 +// MAYTRAP-NEXT: ret void +// +// ERRNO-LABEL: define dso_local void @test_scalbnf_var3( +// ERRNO-SAME: float noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// ERRNO-NEXT: [[ENTRY:.*:]] +// ERRNO-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// ERRNO-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// ERRNO-NEXT: [[D1:%.*]] = alloca float, align 4, addrspace(5) +// ERRNO-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// ERRNO-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// ERRNO-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[CALL:%.*]] = call float @scalbnf(float noundef [[TMP0]], i32 noundef [[TMP1]]) #[[ATTR2]] +// ERRNO-NEXT: store float [[CALL]], ptr 
[[D1_ASCAST]], align 4 +// ERRNO-NEXT: ret void +// +void test_scalbnf_var3(float a, int b) { + float D1 = __builtin_scalbnf(a, b); +} + +// CHECK-LABEL: define dso_local void @test_scalbn( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.ldexp.f64.i32(double 1.720000e+01, i32 10) +// CHECK-NEXT: store double [[TMP0]], ptr [[D1_ASCAST]], align 8 +// CHECK-NEXT: ret void +// DEFAULT-LABEL: define dso_local void @test_scalbn( +// DEFAULT-SAME: ) #[[ATTR0]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// DEFAULT-NEXT: [[TMP0:%.*]] = call double @llvm.ldexp.f64.i32(double 1.720000e+01, i32 10) +// DEFAULT-NEXT: store double [[TMP0]], ptr [[D1_ASCAST]], align 8 +// DEFAULT-NEXT: ret void +// +// IGNORE-LABEL: define dso_local void @test_scalbn( +// IGNORE-SAME: ) #[[ATTR0]] { +// IGNORE-NEXT: [[ENTRY:.*:]] +// IGNORE-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// IGNORE-NEXT: [[TMP0:%.*]] = call double @llvm.ldexp.f64.i32(double 1.720000e+01, i32 10) +// IGNORE-NEXT: store double [[TMP0]], ptr [[D1_ASCAST]], align 8 +// IGNORE-NEXT: ret void +// +// STRICT-LABEL: define dso_local void @test_scalbn( +// STRICT-SAME: ) #[[ATTR0]] { +// STRICT-NEXT: [[ENTRY:.*:]] +// STRICT-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// STRICT-NEXT: [[TMP0:%.*]] = call double @llvm.ldexp.f64.i32(double 1.720000e+01, i32 10) +// STRICT-NEXT: store double [[TMP0]], ptr [[D1_ASCAST]], align 8 +// STRICT-NEXT: ret void +// +// MAYTRAP-LABEL: define dso_local void 
@test_scalbn( +// MAYTRAP-SAME: ) #[[ATTR0]] { +// MAYTRAP-NEXT: [[ENTRY:.*:]] +// MAYTRAP-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// MAYTRAP-NEXT: [[TMP0:%.*]] = call double @llvm.ldexp.f64.i32(double 1.720000e+01, i32 10) +// MAYTRAP-NEXT: store double [[TMP0]], ptr [[D1_ASCAST]], align 8 +// MAYTRAP-NEXT: ret void +// +// ERRNO-LABEL: define dso_local void @test_scalbn( +// ERRNO-SAME: ) #[[ATTR0]] { +// ERRNO-NEXT: [[ENTRY:.*:]] +// ERRNO-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// ERRNO-NEXT: [[CALL:%.*]] = call double @scalbn(double noundef 1.720000e+01, i32 noundef 10) #[[ATTR2]] +// ERRNO-NEXT: store double [[CALL]], ptr [[D1_ASCAST]], align 8 +// ERRNO-NEXT: ret void +// +void test_scalbn() { + double D1 = __builtin_scalbn(17.2, 10); +} +// CHECK-LABEL: define dso_local void @test_scalbn_var1( +// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// CHECK-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP0]], i32 9) +// CHECK-NEXT: store double [[TMP1]], ptr [[D1_ASCAST]], align 8 +// CHECK-NEXT: ret void +// DEFAULT-LABEL: define dso_local void @test_scalbn_var1( +// DEFAULT-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// DEFAULT-NEXT: [[D1:%.*]] = alloca double, align 8, 
addrspace(5) +// DEFAULT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// DEFAULT-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// DEFAULT-NEXT: [[TMP1:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP0]], i32 9) +// DEFAULT-NEXT: store double [[TMP1]], ptr [[D1_ASCAST]], align 8 +// DEFAULT-NEXT: ret void +// +// IGNORE-LABEL: define dso_local void @test_scalbn_var1( +// IGNORE-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// IGNORE-NEXT: [[ENTRY:.*:]] +// IGNORE-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// IGNORE-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// IGNORE-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// IGNORE-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// IGNORE-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// IGNORE-NEXT: [[TMP1:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP0]], i32 9) +// IGNORE-NEXT: store double [[TMP1]], ptr [[D1_ASCAST]], align 8 +// IGNORE-NEXT: ret void +// +// STRICT-LABEL: define dso_local void @test_scalbn_var1( +// STRICT-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// STRICT-NEXT: [[ENTRY:.*:]] +// STRICT-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// STRICT-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// STRICT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// STRICT-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// STRICT-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// STRICT-NEXT: [[TMP1:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP0]], i32 9) +// 
STRICT-NEXT: store double [[TMP1]], ptr [[D1_ASCAST]], align 8 +// STRICT-NEXT: ret void +// +// MAYTRAP-LABEL: define dso_local void @test_scalbn_var1( +// MAYTRAP-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// MAYTRAP-NEXT: [[ENTRY:.*:]] +// MAYTRAP-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// MAYTRAP-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// MAYTRAP-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// MAYTRAP-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// MAYTRAP-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// MAYTRAP-NEXT: [[TMP1:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP0]], i32 9) +// MAYTRAP-NEXT: store double [[TMP1]], ptr [[D1_ASCAST]], align 8 +// MAYTRAP-NEXT: ret void +// +// ERRNO-LABEL: define dso_local void @test_scalbn_var1( +// ERRNO-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// ERRNO-NEXT: [[ENTRY:.*:]] +// ERRNO-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// ERRNO-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// ERRNO-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// ERRNO-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// ERRNO-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// ERRNO-NEXT: [[CALL:%.*]] = call double @scalbn(double noundef [[TMP0]], i32 noundef 9) #[[ATTR2]] +// ERRNO-NEXT: store double [[CALL]], ptr [[D1_ASCAST]], align 8 +// ERRNO-NEXT: ret void +// +void test_scalbn_var1(double a) { + double D1 = __builtin_scalbn(a, 9); +} +// CHECK-LABEL: define dso_local void @test_scalbn_var2( +// CHECK-SAME: i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[D1:%.*]] = alloca 
double, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// CHECK-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.ldexp.f64.i32(double 1.540000e+01, i32 [[TMP0]]) +// CHECK-NEXT: store double [[TMP1]], ptr [[D1_ASCAST]], align 8 +// CHECK-NEXT: ret void +// DEFAULT-LABEL: define dso_local void @test_scalbn_var2( +// DEFAULT-SAME: i32 noundef [[B:%.*]]) #[[ATTR0]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// DEFAULT-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// DEFAULT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// DEFAULT-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP1:%.*]] = call double @llvm.ldexp.f64.i32(double 1.540000e+01, i32 [[TMP0]]) +// DEFAULT-NEXT: store double [[TMP1]], ptr [[D1_ASCAST]], align 8 +// DEFAULT-NEXT: ret void +// +// IGNORE-LABEL: define dso_local void @test_scalbn_var2( +// IGNORE-SAME: i32 noundef [[B:%.*]]) #[[ATTR0]] { +// IGNORE-NEXT: [[ENTRY:.*:]] +// IGNORE-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// IGNORE-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// IGNORE-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// IGNORE-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP0:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP1:%.*]] = call double @llvm.ldexp.f64.i32(double 1.540000e+01, i32 
[[TMP0]]) +// IGNORE-NEXT: store double [[TMP1]], ptr [[D1_ASCAST]], align 8 +// IGNORE-NEXT: ret void +// +// STRICT-LABEL: define dso_local void @test_scalbn_var2( +// STRICT-SAME: i32 noundef [[B:%.*]]) #[[ATTR0]] { +// STRICT-NEXT: [[ENTRY:.*:]] +// STRICT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// STRICT-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// STRICT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// STRICT-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP0:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP1:%.*]] = call double @llvm.ldexp.f64.i32(double 1.540000e+01, i32 [[TMP0]]) +// STRICT-NEXT: store double [[TMP1]], ptr [[D1_ASCAST]], align 8 +// STRICT-NEXT: ret void +// +// MAYTRAP-LABEL: define dso_local void @test_scalbn_var2( +// MAYTRAP-SAME: i32 noundef [[B:%.*]]) #[[ATTR0]] { +// MAYTRAP-NEXT: [[ENTRY:.*:]] +// MAYTRAP-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// MAYTRAP-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// MAYTRAP-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// MAYTRAP-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP0:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP1:%.*]] = call double @llvm.ldexp.f64.i32(double 1.540000e+01, i32 [[TMP0]]) +// MAYTRAP-NEXT: store double [[TMP1]], ptr [[D1_ASCAST]], align 8 +// MAYTRAP-NEXT: ret void +// +// ERRNO-LABEL: define dso_local void @test_scalbn_var2( +// ERRNO-SAME: i32 noundef [[B:%.*]]) #[[ATTR0]] { +// ERRNO-NEXT: [[ENTRY:.*:]] +// ERRNO-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// ERRNO-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// ERRNO-NEXT: [[B_ADDR_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// ERRNO-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[TMP0:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[CALL:%.*]] = call double @scalbn(double noundef 1.540000e+01, i32 noundef [[TMP0]]) #[[ATTR2]] +// ERRNO-NEXT: store double [[CALL]], ptr [[D1_ASCAST]], align 8 +// ERRNO-NEXT: ret void +// +void test_scalbn_var2(int b) { + double D1 = __builtin_scalbn(15.4, b); +} +// CHECK-LABEL: define dso_local void @test_scalbn_var3( +// CHECK-SAME: double noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// CHECK-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP0]], i32 [[TMP1]]) +// CHECK-NEXT: store double [[TMP2]], ptr [[D1_ASCAST]], align 8 +// CHECK-NEXT: ret void +// DEFAULT-LABEL: define dso_local void @test_scalbn_var3( +// DEFAULT-SAME: double noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// DEFAULT-NEXT: [[ENTRY:.*:]] +// DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// DEFAULT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// DEFAULT-NEXT: [[D1:%.*]] = alloca double, 
align 8, addrspace(5) +// DEFAULT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// DEFAULT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// DEFAULT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// DEFAULT-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// DEFAULT-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// DEFAULT-NEXT: [[TMP2:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP0]], i32 [[TMP1]]) +// DEFAULT-NEXT: store double [[TMP2]], ptr [[D1_ASCAST]], align 8 +// DEFAULT-NEXT: ret void +// +// IGNORE-LABEL: define dso_local void @test_scalbn_var3( +// IGNORE-SAME: double noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// IGNORE-NEXT: [[ENTRY:.*:]] +// IGNORE-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// IGNORE-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// IGNORE-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// IGNORE-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// IGNORE-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// IGNORE-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// IGNORE-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// IGNORE-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// IGNORE-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// IGNORE-NEXT: [[TMP2:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP0]], i32 [[TMP1]]) +// IGNORE-NEXT: store double [[TMP2]], ptr [[D1_ASCAST]], align 8 +// IGNORE-NEXT: ret void +// +// STRICT-LABEL: define dso_local void @test_scalbn_var3( +// STRICT-SAME: double noundef [[A:%.*]], i32 noundef 
[[B:%.*]]) #[[ATTR0]] { +// STRICT-NEXT: [[ENTRY:.*:]] +// STRICT-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// STRICT-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// STRICT-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// STRICT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// STRICT-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// STRICT-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// STRICT-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// STRICT-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// STRICT-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// STRICT-NEXT: [[TMP2:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP0]], i32 [[TMP1]]) +// STRICT-NEXT: store double [[TMP2]], ptr [[D1_ASCAST]], align 8 +// STRICT-NEXT: ret void +// +// MAYTRAP-LABEL: define dso_local void @test_scalbn_var3( +// MAYTRAP-SAME: double noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// MAYTRAP-NEXT: [[ENTRY:.*:]] +// MAYTRAP-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// MAYTRAP-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// MAYTRAP-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// MAYTRAP-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// MAYTRAP-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// MAYTRAP-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// MAYTRAP-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// MAYTRAP-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// MAYTRAP-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// MAYTRAP-NEXT: [[TMP2:%.*]] = call double 
@llvm.ldexp.f64.i32(double [[TMP0]], i32 [[TMP1]]) +// MAYTRAP-NEXT: store double [[TMP2]], ptr [[D1_ASCAST]], align 8 +// MAYTRAP-NEXT: ret void +// +// ERRNO-LABEL: define dso_local void @test_scalbn_var3( +// ERRNO-SAME: double noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// ERRNO-NEXT: [[ENTRY:.*:]] +// ERRNO-NEXT: [[A_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// ERRNO-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// ERRNO-NEXT: [[D1:%.*]] = alloca double, align 8, addrspace(5) +// ERRNO-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// ERRNO-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// ERRNO-NEXT: [[D1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D1]] to ptr +// ERRNO-NEXT: store double [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// ERRNO-NEXT: store i32 [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[TMP0:%.*]] = load double, ptr [[A_ADDR_ASCAST]], align 8 +// ERRNO-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// ERRNO-NEXT: [[CALL:%.*]] = call double @scalbn(double noundef [[TMP0]], i32 noundef [[TMP1]]) #[[ATTR2]] +// ERRNO-NEXT: store double [[CALL]], ptr [[D1_ASCAST]], align 8 +// ERRNO-NEXT: ret void +// +void test_scalbn_var3(double a, int b) { + double D1 = __builtin_scalbn(a, b); +} diff --git a/clang/test/CodeGenCUDA/increment-index-for-thunks.cu b/clang/test/CodeGenCUDA/increment-index-for-thunks.cu new file mode 100644 index 0000000000000..48dbf6ef82b7d --- /dev/null +++ b/clang/test/CodeGenCUDA/increment-index-for-thunks.cu @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -target-cpu gfx942 \ +// RUN: -emit-llvm -xhip %s -o - | FileCheck %s --check-prefix=GCN +// RUN: %clang_cc1 -fcuda-is-device -triple spirv64-amd-amdhsa \ +// RUN: -emit-llvm -xhip %s -o - | FileCheck %s --check-prefix=SPIRV + +// GCN: @_ZTV1C = linkonce_odr unnamed_addr addrspace(1) constant { [5 x ptr addrspace(1)], [4 x 
ptr addrspace(1)] } { [5 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) addrspacecast (ptr @_ZN1B2f2Ev to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) addrspacecast (ptr @_ZN1C2f1Ev to ptr addrspace(1))], [4 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 -8 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) addrspacecast (ptr @_ZThn8_N1C2f1Ev to ptr addrspace(1))] }, comdat, align 8 +// GCN: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) addrspacecast (ptr @_ZN1B2f2Ev to ptr addrspace(1))] }, comdat, align 8 +// GCN: @_ZTV1A = linkonce_odr unnamed_addr addrspace(1) constant { [4 x ptr addrspace(1)] } { [4 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) addrspacecast (ptr @__cxa_pure_virtual to ptr addrspace(1))] }, comdat, align 8 +// SPIRV: @_ZTV1C = linkonce_odr unnamed_addr addrspace(1) constant { [5 x ptr addrspace(1)], [4 x ptr addrspace(1)] } { [5 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) addrspacecast (ptr addrspace(4) @_ZN1B2f2Ev to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) addrspacecast (ptr addrspace(4) @_ZN1C2f1Ev to ptr addrspace(1))], [4 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 -8 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) addrspacecast (ptr addrspace(4) @_ZThn8_N1C2f1Ev to ptr addrspace(1))] }, comdat, align 8 +// SPIRV: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) addrspacecast (ptr addrspace(4) @_ZN1B2f2Ev to ptr addrspace(1))] }, comdat, align 8 +// SPIRV: @_ZTV1A = linkonce_odr unnamed_addr addrspace(1) constant { [4 x ptr addrspace(1)] } { 
[4 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) addrspacecast (ptr addrspace(4) @__cxa_pure_virtual to ptr addrspace(1))] }, comdat, align 8 + +struct A { + __attribute__((device)) A() { } + virtual void neither_device_nor_host_f() = 0 ; + __attribute__((device)) virtual void f1() = 0; + +}; + +struct B { + __attribute__((device)) B() { } + __attribute__((device)) virtual void f2() { }; +}; + +struct C : public B, public A { + __attribute__((device)) C() : B(), A() { } + + virtual void neither_device_nor_host_f() override { } + __attribute__((device)) virtual void f1() override { } + +}; + +__attribute__((device)) void test() { + C obj; +} diff --git a/clang/test/CodeGenCUDA/lambda-constexpr-capture.cu b/clang/test/CodeGenCUDA/lambda-constexpr-capture.cu new file mode 100644 index 0000000000000..1a1db63ceb717 --- /dev/null +++ b/clang/test/CodeGenCUDA/lambda-constexpr-capture.cu @@ -0,0 +1,135 @@ +// RUN: %clang_cc1 -emit-llvm -x hip %s -o - -triple x86_64-linux-gnu \ +// RUN: | FileCheck -check-prefixes=CHECK,HOST %s +// RUN: %clang_cc1 -emit-llvm -x hip %s -o - -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: | FileCheck -check-prefixes=CHECK,DEV %s + +#include "Inputs/cuda.h" + +// CHECK: %class.anon = type { ptr, float, ptr, ptr } +// CHECK: %class.anon.0 = type { ptr, float, ptr, ptr } +// CHECK: %class.anon.1 = type { ptr, ptr, ptr } +// CHECK: %class.anon.2 = type { ptr, float, ptr, ptr } + +// HOST: call void @_ZN8DevByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon) +// DEV: define amdgpu_kernel void @_ZN8DevByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon) + +// Only the device function passes arugments by value. 
+namespace DevByVal { +__device__ float fun(float x, float y) { + return x; +} + +float fun(const float &x, const float &y) { + return x; +} + +template +void __global__ kernel(F f) +{ + f(1); +} + +void test(float const * fl, float const * A, float * Vf) +{ + float constexpr small(1.0e-25); + + auto lambda = [=] __device__ __host__ (unsigned int n) { + float const value = fun(small, fl[0]); + Vf[0] = value * A[0]; + }; + kernel<<<1, 1>>>(lambda); +} +} + +// HOST: call void @_ZN9HostByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon.0) +// DEV: define amdgpu_kernel void @_ZN9HostByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon.0) + +// Only the host function passes arugments by value. +namespace HostByVal { +float fun(float x, float y) { + return x; +} + +__device__ float fun(const float &x, const float &y) { + return x; +} + +template +void __global__ kernel(F f) +{ + f(1); +} + +void test(float const * fl, float const * A, float * Vf) +{ + float constexpr small(1.0e-25); + + auto lambda = [=] __device__ __host__ (unsigned int n) { + float const value = fun(small, fl[0]); + Vf[0] = value * A[0]; + }; + kernel<<<1, 1>>>(lambda); +} +} + +// HOST: call void @_ZN9BothByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon.1) +// DEV: define amdgpu_kernel void @_ZN9BothByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon.1) + +// Both the host and device functions pass arugments by value. 
+namespace BothByVal { +float fun(float x, float y) { + return x; +} + +__device__ float fun(float x, float y) { + return x; +} + +template +void __global__ kernel(F f) +{ + f(1); +} + +void test(float const * fl, float const * A, float * Vf) +{ + float constexpr small(1.0e-25); + + auto lambda = [=] __device__ __host__ (unsigned int n) { + float const value = fun(small, fl[0]); + Vf[0] = value * A[0]; + }; + kernel<<<1, 1>>>(lambda); +} +} + +// HOST: call void @_ZN12NeitherByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon.2) +// DEV: define amdgpu_kernel void @_ZN12NeitherByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon.2) + +// Neither the host nor device function passes arugments by value. +namespace NeitherByVal { +float fun(const float& x, const float& y) { + return x; +} + +__device__ float fun(const float& x, const float& y) { + return x; +} + +template +void __global__ kernel(F f) +{ + f(1); +} + +void test(float const * fl, float const * A, float * Vf) +{ + float constexpr small(1.0e-25); + + auto lambda = [=] __device__ __host__ (unsigned int n) { + float const value = fun(small, fl[0]); + Vf[0] = value * A[0]; + }; + kernel<<<1, 1>>>(lambda); +} +} diff --git a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp index 5d49cc0544b9c..6bbae52d9c961 100644 --- a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp +++ b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp @@ -112,9 +112,9 @@ const B& f(A *a) { // CHECK: attributes #[[ATTR3]] = { nounwind } // CHECK: attributes #[[ATTR4]] = { noreturn } //. 
-// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } +// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" } // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) } -// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } +// 
WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" } // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR3]] = { nounwind } // WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR4]] = { noreturn } //. diff --git a/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp b/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp new file mode 100644 index 0000000000000..f7438c374dd32 --- /dev/null +++ b/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp @@ -0,0 +1,17 @@ +// Check that if we are compiling with fgpu-rdc amdgpu-enable-hipstdpar is not +// passed to CC1, to avoid eager, per TU, removal of potentially accessible +// functions. 
+ +// RUN: %clang -### --hipstdpar --offload-arch=gfx906 -nogpulib -nogpuinc %s \ +// RUN: --hipstdpar-path=%S/../Driver/Inputs/hipstdpar \ +// RUN: --hipstdpar-thrust-path=%S/../Driver/Inputs/hipstdpar/thrust \ +// RUN: --hipstdpar-prim-path=%S/../Driver/Inputs/hipstdpar/rocprim 2>&1 \ +// RUN: | FileCheck %s -check-prefix=NORDC +// NORDC: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar" + +// RUN: %clang -### --hipstdpar --offload-arch=gfx906 -nogpulib -nogpuinc %s \ +// RUN: -fgpu-rdc --hipstdpar-path=%S/../Driver/Inputs/hipstdpar \ +// RUN: --hipstdpar-thrust-path=%S/../Driver/Inputs/hipstdpar/thrust \ +// RUN: --hipstdpar-prim-path=%S/../Driver/Inputs/hipstdpar/rocprim 2>&1 \ +// RUN: | FileCheck %s -check-prefix=RDC +// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl new file mode 100644 index 0000000000000..f554d2f72f869 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl @@ -0,0 +1,27 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 \ +// RUN: -emit-llvm -o - | FileCheck %s + +// CHECK-LABEL: @test_builtin_amdgcn_cvt_off_f32_i4_ui( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[N:%.*]], ptr addrspace(5) [[N_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 [[TMP0]]) +// CHECK-NEXT: ret float [[TMP1]] +// +float test_builtin_amdgcn_cvt_off_f32_i4_ui(unsigned n) { + return __builtin_amdgcn_cvt_off_f32_i4(n); +} + +// CHECK-LABEL: @test_builtin_amdgcn_cvt_off_f32_i4_i( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[N:%.*]], ptr addrspace(5) [[N_ADDR]], 
align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 [[TMP0]]) +// CHECK-NEXT: ret float [[TMP1]] +// +float test_builtin_amdgcn_cvt_off_f32_i4_i(int n) { + return __builtin_amdgcn_cvt_off_f32_i4(n); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl similarity index 90% rename from clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl rename to clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl index fc5649d8a41f7..b56c92bc3fe34 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl @@ -1,5 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s // REQUIRES: amdgpu-registered-target typedef unsigned int u32; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl index 3403b69e07e4b..5e3ed9027c17a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl @@ -170,3 +170,12 @@ v3u32 test_amdgcn_raw_ptr_buffer_load_b96_non_const_soffset(__amdgpu_buffer_rsrc v4u32 test_amdgcn_raw_ptr_buffer_load_b128_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { return __builtin_amdgcn_raw_buffer_load_b128(rsrc, /*offset=*/0, 
soffset, /*aux=*/0); } + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_lds( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 1, i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int offset, int soffset) { + __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 1, offset, soffset, 2, 3); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl index 7f73cdd61c80d..2cf7f3dc6f80e 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl @@ -3,6 +3,10 @@ // RUN: -emit-llvm -cl-std=CL2.0 -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -S \ // RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=ISA %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm \ +// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12 %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S \ +// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12ISA %s // Test llvm.amdgcn.image.bvh.intersect.ray intrinsic. @@ -12,12 +16,18 @@ // Postfix l indicates the 1st argument is i64 and postfix h indicates // the 4/5-th arguments are half4. 
+typedef unsigned char uchar; typedef unsigned int uint; typedef unsigned long ulong; +typedef float float3 __attribute__((ext_vector_type(3))); typedef float float4 __attribute__((ext_vector_type(4))); typedef double double4 __attribute__((ext_vector_type(4))); typedef half half4 __attribute__((ext_vector_type(4))); +typedef uint uint2 __attribute__((ext_vector_type(2))); typedef uint uint4 __attribute__((ext_vector_type(4))); +typedef uint uint8 __attribute__((ext_vector_type(8))); +typedef uint uint10 __attribute__((ext_vector_type(10))); +typedef ulong ulong2 __attribute__((ext_vector_type(2))); // CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32 // ISA: image_bvh_intersect_ray @@ -59,3 +69,71 @@ void test_image_bvh_intersect_ray_lh(global uint4* out, ulong node_ptr, ray_origin, ray_dir, ray_inv_dir, texture_descr); } +#if __has_builtin(__builtin_amdgcn_image_bvh8_intersect_ray) +// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh8.intersect.ray( +// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin, +// GFX12: <3 x float> %ray_dir, i32 %offset, <4 x i32> %texture_descr) +// GFX12ISA: image_bvh8_intersect_ray +void test_image_bvh8_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin, + float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask, + float3 ray_origin, float3 ray_dir, uint offset, uint4 texture_descr) +{ + *ret_vdata = __builtin_amdgcn_image_bvh8_intersect_ray(node_ptr, ray_extent, + instance_mask, ray_origin, ray_dir, offset, texture_descr, + ret_ray_origin, ret_ray_dir); +} +#endif + +#if __has_builtin(__builtin_amdgcn_image_bvh_dual_intersect_ray) +// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh.dual.intersect.ray( +// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin, +// GFX12: <3 x float> %ray_dir, <2 x i32> %offset, <4 x i32> %texture_descr) +// GFX12ISA: 
image_bvh_dual_intersect_ray +void test_builtin_amdgcn_image_bvh_dual_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin, + float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask, + float3 ray_origin, float3 ray_dir, uint2 offset, uint4 texture_descr) +{ + *ret_vdata = __builtin_amdgcn_image_bvh_dual_intersect_ray(node_ptr, ray_extent, + instance_mask, ray_origin, ray_dir, offset, texture_descr, + ret_ray_origin, ret_ray_dir); +} +#endif + +#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn) +// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn( +// GFX12: i32 %addr, i32 %data0, <4 x i32> %data1, i32 0) +// GFX12ISA: ds_bvh_stack_push4_pop1_rtn +void test_builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(uint* ret_vdst, uint* ret_addr, + uint addr, uint data0, uint4 data1) +{ + uint2 ret = __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(addr, data0, data1, /*constant offset=*/0); + *ret_vdst = ret.x; + *ret_addr = ret.y; +} +#endif + +#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn) +// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn( +// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0) +// GFX12ISA: ds_bvh_stack_push8_pop1_rtn +void test_builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(uint* ret_vdst, uint* ret_addr, + uint addr, uint data0, uint8 data1) +{ + uint2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(addr, data0, data1, /*constant offset=*/0); + *ret_vdst = ret.x; + *ret_addr = ret.y; +} +#endif + +#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn) +// GFX12: call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn( +// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0) +// GFX12ISA: ds_bvh_stack_push8_pop2_rtn +void test_builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(ulong* ret_vdst, uint* ret_addr, + uint addr, uint data0, uint8 data1) +{ + ulong2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(addr, data0, data1, /*constant offset=*/0); + 
*ret_vdst = ret.x; + *ret_addr = ret.y; +} +#endif diff --git a/clang/test/CodeGenOpenCL/check-atomic-alignment.cl b/clang/test/CodeGenOpenCL/check-atomic-alignment.cl new file mode 100644 index 0000000000000..4159ac882f226 --- /dev/null +++ b/clang/test/CodeGenOpenCL/check-atomic-alignment.cl @@ -0,0 +1,41 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx942 \ +// RUN: %s -emit-llvm -o - -disable-llvm-passes | FileCheck %s + +// REQUIRES: amdgpu-registered-target + +// `Ptr.getElementType()` in `CheckAtomicAlignment` returns +// %struct.__half2 = type { %union.anon } +// Check we do not crash when handling that. + +typedef half __attribute__((ext_vector_type(2))) half2; +typedef short __attribute__((ext_vector_type(2))) short2; + +struct __half2 { + union { + struct { + half x; + half y; + }; + half2 data; + }; +}; + +// CHECK-LABEL: define dso_local <2 x half> @test_flat_add_2f16( +// CHECK-SAME: ptr noundef [[ADDR:%.*]], <2 x half> noundef [[VAL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VAL_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: store ptr [[ADDR]], ptr addrspace(5) [[ADDR_ADDR]], align 8 +// CHECK-NEXT: store <2 x half> [[VAL]], ptr addrspace(5) [[VAL_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[ADDR_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(5) [[VAL_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]] +// CHECK-NEXT: ret <2 x half> [[TMP2]] +// +half2 test_flat_add_2f16(short2 *addr, half2 val) { + return __builtin_amdgcn_flat_atomic_fadd_v2f16((struct __half2*)addr, val); +} +//. 
+// CHECK: [[META4]] = !{} +//. diff --git a/clang/test/Driver/XRay/xray-instrument.c b/clang/test/Driver/XRay/xray-instrument.c index 48e20c45be4ac..d3cefb6680f4c 100644 --- a/clang/test/Driver/XRay/xray-instrument.c +++ b/clang/test/Driver/XRay/xray-instrument.c @@ -3,6 +3,11 @@ // RUN: %clang -### --target=x86_64-apple-darwin -fxray-instrument -c %s -o /dev/null 2>&1 | FileCheck %s // RUN: not %clang -### --target=x86_64-pc-windows -fxray-instrument -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR +/// Checking -fxray-instrument with offloading and -Xarch_host +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -Xarch_host -fxray-instrument -c %s -o /dev/null 2>&1 | FileCheck %s +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -x hip --offload-arch=gfx906 -nogpulib -nogpuinc -fxray-instrument -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -x hip --offload-arch=gfx906 -nogpulib -nogpuinc -Xarch_host -fxray-instrument -c %s -o /dev/null 2>&1 | FileCheck %s + // CHECK: "-cc1" {{.*}}"-fxray-instrument" // ERR: error: unsupported option '-fxray-instrument' for target diff --git a/clang/test/Driver/amdgpu-debug.cl b/clang/test/Driver/amdgpu-debug.cl index 0ca9bd7929993..e1617941fa7e9 100644 --- a/clang/test/Driver/amdgpu-debug.cl +++ b/clang/test/Driver/amdgpu-debug.cl @@ -46,3 +46,14 @@ // Check that -gheterogeneous-dwarf= fails for unknown option // RUN: not %clang -target amdgcn-amd-amdhsa -x cl -c -nogpuinc -nogpulib -emit-llvm -g -gheterogeneous-dwarf=unknown %s 2>&1 | FileCheck -check-prefix=CHECK-UNKNOWN %s // CHECK-UNKNOWN: error: invalid value + +// Check that =diexpression is implied by -g + spirv +// RUN: %clang -### -target spirv64-amd-amdhsa -x cl -c -nogpuinc -nogpulib -emit-llvm -g %s 2>&1 | FileCheck -check-prefix=CHECK-SPIRV %s +// CHECK-SPIRV: "-cc1" +// CHECK-SPIRV-DAG: "-mllvm" "-amdgpu-spill-cfi-saved-regs" +// CHECK-SPIRV-DAG: 
"-gheterogeneous-dwarf=diexpression" +// CHECK-SPIRV-DAG: "-debugger-tuning=gdb" + +// Check that =diexpr produces an error on spirv. +// RUN: not %clang -### -target spirv64-amd-amdhsa -x cl -c -nogpuinc -nogpulib -emit-llvm -g -gheterogeneous-dwarf=diexpr %s 2>&1 | FileCheck -check-prefix=CHECK-SPIRV-ERR %s +// CHECK-SPIRV-ERR: error: unsupported option '-gheterogeneous-dwarf=diexpr'; did you mean '-gheterogeneous-dwarf=diexpression'? diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c index c1c5aa8e90e68..ac923247e190a 100644 --- a/clang/test/Driver/amdgpu-toolchain.c +++ b/clang/test/Driver/amdgpu-toolchain.c @@ -18,21 +18,15 @@ // AS_LINK_UR: "-cc1as" // AS_LINK_UR: ld.lld{{.*}} "--no-undefined"{{.*}} "--unresolved-symbols=ignore-all" -// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \ -// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefixes=LTO,MCPU %s -// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \ +// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \ +// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=LTO %s +// LTO: clang{{.*}}"-flto=full"{{.*}}"-fconvergent-functions" +// LTO: ld.lld{{.*}}"-plugin-opt=mcpu=gfx906"{{.*}}"{{.*}} + +// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \ // RUN: -L. 
-fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s -// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions" -// MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack" +// MCPU: ld.lld{{.*}}"-plugin-opt=mcpu=gfx906"{{.*}} // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \ // RUN: -fuse-ld=ld %s 2>&1 | FileCheck -check-prefixes=LD %s // LD: ld.lld - -// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \ -// RUN: -r %s 2>&1 | FileCheck -check-prefixes=RELO %s -// RELO-NOT: -shared - -// RUN: %clang -target amdgcn-amd-amdhsa -march=gfx90a -stdlib -startfiles \ -// RUN: -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=STARTUP %s -// STARTUP: ld.lld{{.*}}"-lc" "-lm" "{{.*}}crt1.o" diff --git a/clang/test/Driver/clang-offload-bundler-asserts-on.c b/clang/test/Driver/clang-offload-bundler-asserts-on.c index 55060c2c42e73..0710d2f6bf279 100644 --- a/clang/test/Driver/clang-offload-bundler-asserts-on.c +++ b/clang/test/Driver/clang-offload-bundler-asserts-on.c @@ -15,18 +15,18 @@ // Check code object compatibility for archive unbundling // // Create few code object bundles and archive them to create an input archive -// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa-gfx906,openmp-amdgcn-amd-amdhsa--gfx908 -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.simple.bundle +// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906,openmp-amdgcn-amd-amdhsa--gfx908 -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.simple.bundle // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:sramecc+:xnack+,openmp-amdgcn-amd-amdhsa--gfx908:sramecc+:xnack+ -inputs=%t.o,%t.tgt1,%t.tgt1 -outputs=%t.targetID1.bundle // RUN: clang-offload-bundler -type=o 
-targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-,openmp-amdgcn-amd-amdhsa--gfx908:sramecc+:xnack- -inputs=%t.o,%t.tgt1,%t.tgt1 -outputs=%t.targetID2.bundle // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906:xnack-,openmp-amdgcn-amd-amdhsa--gfx908:xnack- -inputs=%t.o,%t.tgt1,%t.tgt1 -outputs=%t.targetID3.bundle // RUN: llvm-ar cr %t.input-archive.a %t.simple.bundle %t.targetID1.bundle %t.targetID2.bundle %t.targetID3.bundle // Tests to check compatibility between Bundle Entry ID formats i.e. between presence/absence of extra hyphen in case of missing environment field -// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa--gfx906,openmp-amdgcn-amd-amdhsa-gfx908 -input=%t.input-archive.a -output=%t-archive-gfx906-simple.a -output=%t-archive-gfx908-simple.a -debug-only=CodeObjectCompatibility 2>&1 | FileCheck %s -check-prefix=BUNDLECOMPATIBILITY +// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa--gfx906,openmp-amdgcn-amd-amdhsa--gfx908 -input=%t.input-archive.a -output=%t-archive-gfx906-simple.a -output=%t-archive-gfx908-simple.a -debug-only=CodeObjectCompatibility 2>&1 | FileCheck %s -check-prefix=BUNDLECOMPATIBILITY // BUNDLECOMPATIBILITY: Compatible: Exact match: [CodeObject: openmp-amdgcn-amd-amdhsa--gfx906] : [Target: openmp-amdgcn-amd-amdhsa--gfx906] // BUNDLECOMPATIBILITY: Compatible: Exact match: [CodeObject: openmp-amdgcn-amd-amdhsa--gfx908] : [Target: openmp-amdgcn-amd-amdhsa--gfx908] -// RUN: clang-offload-bundler -unbundle -type=a -targets=hip-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa-gfx908 -input=%t.input-archive.a -output=%t-hip-archive-gfx906-simple.a -output=%t-hipv4-archive-gfx908-simple.a -hip-openmp-compatible -debug-only=CodeObjectCompatibility 2>&1 | FileCheck %s -check-prefix=HIPOpenMPCOMPATIBILITY +// RUN: clang-offload-bundler -unbundle -type=a 
-targets=hip-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908 -input=%t.input-archive.a -output=%t-hip-archive-gfx906-simple.a -output=%t-hipv4-archive-gfx908-simple.a -hip-openmp-compatible -debug-only=CodeObjectCompatibility 2>&1 | FileCheck %s -check-prefix=HIPOpenMPCOMPATIBILITY // HIPOpenMPCOMPATIBILITY: Compatible: Target IDs are compatible [CodeObject: openmp-amdgcn-amd-amdhsa--gfx906] : [Target: hip-amdgcn-amd-amdhsa--gfx906] // HIPOpenMPCOMPATIBILITY: Compatible: Target IDs are compatible [CodeObject: openmp-amdgcn-amd-amdhsa--gfx908] : [Target: hipv4-amdgcn-amd-amdhsa--gfx908] diff --git a/clang/test/Driver/clang-offload-bundler-standardize.c b/clang/test/Driver/clang-offload-bundler-standardize.c index 52f5ea038e47b..fd87fca4ff59d 100644 --- a/clang/test/Driver/clang-offload-bundler-standardize.c +++ b/clang/test/Driver/clang-offload-bundler-standardize.c @@ -15,20 +15,12 @@ // // Check code object compatibility for archive unbundling // -// Create an object bundle with and without env fields -// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,hip-amdgcn-amd-amdhsa-gfx906,hip-amdgcn-amd-amdhsa-gfx908 -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.bundle.no.env -// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple-,hip-amdgcn-amd-amdhsa--gfx906,hip-amdgcn-amd-amdhsa--gfx908 -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.bundle.env +// Create an object bundle +// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,hip-amdgcn-amd-amdhsa--gfx906,hip-amdgcn-amd-amdhsa--gfx908 -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.bundle - -// Unbundle bundle.no.env while providing targets with env -// RUN: clang-offload-bundler -unbundle -type=o -targets=hip-amdgcn-amd-amdhsa--gfx906,hip-amdgcn-amd-amdhsa--gfx908 -input=%t.bundle.no.env -output=%t-hip-amdgcn-amd-amdhsa--gfx906.bc -output=%t-hip-amdgcn-amd-amdhsa--gfx908.bc -debug-only=CodeObjectCompatibility 2>&1 | FileCheck %s 
-check-prefix=BUNDLE-NO-ENV -// BUNDLE-NO-ENV: Compatible: Exact match: [CodeObject: hip-amdgcn-amd-amdhsa--gfx906] : [Target: hip-amdgcn-amd-amdhsa--gfx906] -// BUNDLE-NO-ENV: Compatible: Exact match: [CodeObject: hip-amdgcn-amd-amdhsa--gfx908] : [Target: hip-amdgcn-amd-amdhsa--gfx908] - -// Unbundle bundle.env while providing targets with no env -// RUN: clang-offload-bundler -unbundle -type=o -targets=hip-amdgcn-amd-amdhsa-gfx906,hip-amdgcn-amd-amdhsa-gfx908 -input=%t.bundle.env -output=%t-hip-amdgcn-amd-amdhsa-gfx906.bc -output=%t-hip-amdgcn-amd-amdhsa-gfx908.bc -debug-only=CodeObjectCompatibility 2>&1 | FileCheck %s -check-prefix=BUNDLE-ENV -// BUNDLE-ENV: Compatible: Exact match: [CodeObject: hip-amdgcn-amd-amdhsa--gfx906] : [Target: hip-amdgcn-amd-amdhsa--gfx906] -// BUNDLE-ENV: Compatible: Exact match: [CodeObject: hip-amdgcn-amd-amdhsa--gfx908] : [Target: hip-amdgcn-amd-amdhsa--gfx908] +// RUN: clang-offload-bundler -unbundle -type=o -targets=hip-amdgcn-amd-amdhsa--gfx906,hip-amdgcn-amd-amdhsa--gfx908 -input=%t.bundle -output=%t-hip-amdgcn-amd-amdhsa--gfx906.bc -output=%t-hip-amdgcn-amd-amdhsa--gfx908.bc -debug-only=CodeObjectCompatibility 2>&1 | FileCheck %s -check-prefix=BUNDLE +// BUNDLE: Compatible: Exact match: [CodeObject: hip-amdgcn-amd-amdhsa--gfx906] : [Target: hip-amdgcn-amd-amdhsa--gfx906] +// BUNDLE: Compatible: Exact match: [CodeObject: hip-amdgcn-amd-amdhsa--gfx908] : [Target: hip-amdgcn-amd-amdhsa--gfx908] // Some code so that we can create a binary out of this file. int A = 0; diff --git a/clang/test/Driver/clang-offload-bundler-zlib.c b/clang/test/Driver/clang-offload-bundler-zlib.c index b026e2ec99877..211601c2c7fbb 100644 --- a/clang/test/Driver/clang-offload-bundler-zlib.c +++ b/clang/test/Driver/clang-offload-bundler-zlib.c @@ -66,6 +66,30 @@ // NOHOST-V3-DAG: hip-amdgcn-amd-amdhsa--gfx900 // NOHOST-V3-DAG: hip-amdgcn-amd-amdhsa--gfx906 +// Check compression/decompression of offload bundle using version 2 format. 
+// +// RUN: env OFFLOAD_BUNDLER_COMPRESS=1 OFFLOAD_BUNDLER_VERBOSE=1 COMPRESSED_BUNDLE_FORMAT_VERSION=2 \ +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ +// RUN: -input=%t.tgt1 -input=%t.tgt2 -output=%t.hip.bundle.bc 2>&1 | \ +// RUN: FileCheck -check-prefix=COMPRESS-V2 %s +// RUN: clang-offload-bundler -type=bc -list -input=%t.hip.bundle.bc | FileCheck -check-prefix=NOHOST-V2 %s +// RUN: env OFFLOAD_BUNDLER_VERBOSE=1 \ +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ +// RUN: -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.hip.bundle.bc -unbundle 2>&1 | \ +// RUN: FileCheck -check-prefix=DECOMPRESS-V2 %s +// RUN: diff %t.tgt1 %t.res.tgt1 +// RUN: diff %t.tgt2 %t.res.tgt2 +// +// COMPRESS-V2: Compressed bundle format version: 2 +// COMPRESS-V2: Compression method used: zlib +// COMPRESS-V2: Compression level: 6 +// DECOMPRESS-V2: Compressed bundle format version: 2 +// DECOMPRESS-V2: Decompression method: zlib +// DECOMPRESS-V2: Hashes match: Yes +// NOHOST-V2-NOT: host- +// NOHOST-V2-DAG: hip-amdgcn-amd-amdhsa--gfx900 +// NOHOST-V2-DAG: hip-amdgcn-amd-amdhsa--gfx906 + // Check -compression-level= option // RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ diff --git a/clang/test/Driver/clang-offload-bundler-zstd.c b/clang/test/Driver/clang-offload-bundler-zstd.c index 667d9554daec7..c1123ae5acb38 100644 --- a/clang/test/Driver/clang-offload-bundler-zstd.c +++ b/clang/test/Driver/clang-offload-bundler-zstd.c @@ -29,11 +29,11 @@ // RUN: diff %t.tgt1 %t.res.tgt1 // RUN: diff %t.tgt2 %t.res.tgt2 // -// CHECK: Compressed bundle format version: 2 +// CHECK: Compressed bundle format version: 3 // CHECK: Total file size (including headers): [[SIZE:[0-9]*]] bytes // CHECK: Compression method used: zstd // CHECK: Compression level: 3 -// CHECK: Compressed bundle format version: 2 +// CHECK: 
Compressed bundle format version: 3 // CHECK: Total file size (from header): [[SIZE]] bytes // CHECK: Decompression method: zstd // CHECK: Hashes match: Yes diff --git a/clang/test/Driver/clang-offload-bundler.c b/clang/test/Driver/clang-offload-bundler.c index 1909ff2d71d03..95ea058665c3b 100644 --- a/clang/test/Driver/clang-offload-bundler.c +++ b/clang/test/Driver/clang-offload-bundler.c @@ -116,7 +116,7 @@ // RUN: not clang-offload-bundler -type=i -targets=host-powerpc64le-ibm-linux-gnu,openmp-powerpc64le-ibm-linux-gnu,xpenmp-x86_xx-pc-linux-gnu -input=%t.i -input=%t.tgt1 -input=%t.tgt2 -output=%t.bundle.i 2>&1 | FileCheck %s --check-prefix CK-ERR8B // CK-ERR8B: error: invalid target 'xpenmp-x86_xx-pc-linux-gnu', unknown offloading kind 'xpenmp', unknown target triple 'x86_xx-pc-linux-gnu' -// RUN: not clang-offload-bundler -type=i -targets=openmp-powerpc64le-linux,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -input=%t.i -input=%t.tgt1 -input=%t.tgt2 -output=%t.bundle.i 2>&1 | FileCheck %s --check-prefix CK-ERR9A +// RUN: not clang-offload-bundler -type=i -targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -input=%t.i -input=%t.tgt1 -output=%t.bundle.i 2>&1 | FileCheck %s --check-prefix CK-ERR9A // CK-ERR9A: error: expecting exactly one host target but got 0 // RUN: not clang-offload-bundler -type=i -targets=host-%itanium_abi_triple,host-%itanium_abi_triple,openmp-x86_64-pc-linux-gnu -input=%t.i -input=%t.tgt1 -input=%t.tgt2 -output=%t.bundle.i 2>&1 | FileCheck %s --check-prefix CK-ERR9B @@ -238,7 +238,7 @@ // Check that bindler prints an error if given host bundle does not exist in the fat binary. 
// RUN: not clang-offload-bundler -type=s -targets=host-amdgcn-xxx-linux-gnu,openmp-powerpc64le-ibm-linux-gnu -output=%t.res.s -output=%t.res.tgt1 -input=%t.bundle3.s -unbundle 2>&1 | FileCheck %s --check-prefix CK-NO-HOST-BUNDLE -// CK-NO-HOST-BUNDLE: error: Can't find bundles for host-amdgcn-xxx-linux-gnu +// CK-NO-HOST-BUNDLE: error: Can't find bundles for host-amdgcn-xxx-linux- // Check missing host entry is allowed with -allow-missing-bundles // RUN: clang-offload-bundler -type=s -targets=host-amdgcn-xxx-linux-gnu,openmp-powerpc64le-ibm-linux-gnu -output=%t.res.s -output=%t.res.tgt1 -input=%t.bundle3.s -unbundle -allow-missing-bundles @@ -520,32 +520,32 @@ // Check archive unbundling // // Create few code object bundles and archive them to create an input archive -// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa-gfx906,openmp-amdgcn-amd-amdhsa--gfx908 -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.simple.bundle +// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906,openmp-amdgcn-amd-amdhsa--gfx908 -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.simple.bundle // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx903 -input=%t.o -input=%t.tgt1 -output=%t.simple1.bundle // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,hip-amdgcn-amd-amdhsa--gfx906 -input=%t.o -input=%t.tgt1 -output=%t.simple2.bundle // RUN: llvm-ar cr %t.input-archive.a %t.simple.bundle %t.simple1.bundle %t.simple2.bundle -// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa-gfx906,openmp-amdgcn-amd-amdhsa-gfx908 -input=%t.input-archive.a -output=%t-archive-gfx906-simple.a -output=%t-archive-gfx908-simple.a +// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa--gfx906,openmp-amdgcn-amd-amdhsa--gfx908 -input=%t.input-archive.a -output=%t-archive-gfx906-simple.a 
-output=%t-archive-gfx908-simple.a // RUN: llvm-ar t %t-archive-gfx906-simple.a | FileCheck %s -check-prefix=GFX906 -// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa-gfx906:xnack+ -input=%t.input-archive.a -output=%t-archive-gfx906-simple.a +// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa--gfx906:xnack+ -input=%t.input-archive.a -output=%t-archive-gfx906-simple.a // RUN: llvm-ar t %t-archive-gfx906-simple.a | FileCheck %s -check-prefix=GFX906 // GFX906: simple-openmp-amdgcn-amd-amdhsa--gfx906 // RUN: llvm-ar t %t-archive-gfx908-simple.a | FileCheck %s -check-prefix=GFX908 // GFX908-NOT: {{gfx906}} -// RUN: not clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa-gfx906,openmp-amdgcn-amd-amdhsa-gfx906:sramecc+ -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.bad.bundle 2>&1 | FileCheck %s -check-prefix=BADTARGETS +// RUN: not clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-amdgcn-amd-amdhsa--gfx906,openmp-amdgcn-amd-amdhsa--gfx906:sramecc+ -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.bad.bundle 2>&1 | FileCheck %s -check-prefix=BADTARGETS // BADTARGETS: error: Cannot bundle inputs with conflicting targets: 'openmp-amdgcn-amd-amdhsa--gfx906' and 'openmp-amdgcn-amd-amdhsa--gfx906:sramecc+' // Check for error if no compatible code object is found in the heterogeneous archive library -// RUN: not clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa-gfx803 -input=%t.input-archive.a -output=%t-archive-gfx803-incompatible.a 2>&1 | FileCheck %s -check-prefix=INCOMPATIBLEARCHIVE +// RUN: not clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa--gfx803 -input=%t.input-archive.a -output=%t-archive-gfx803-incompatible.a 2>&1 | FileCheck %s -check-prefix=INCOMPATIBLEARCHIVE // INCOMPATIBLEARCHIVE: error: no compatible code object found for the target 'openmp-amdgcn-amd-amdhsa--gfx803' in heterogeneous 
archive library // Check creation of empty archive if allow-missing-bundles is present and no compatible code object is found in the heterogeneous archive library -// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa-gfx803 -input=%t.input-archive.a -output=%t-archive-gfx803-empty.a -allow-missing-bundles +// RUN: clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa--gfx803 -input=%t.input-archive.a -output=%t-archive-gfx803-empty.a -allow-missing-bundles // RUN: cat %t-archive-gfx803-empty.a | FileCheck %s -check-prefix=EMPTYARCHIVE // EMPTYARCHIVE: ! // Check compatibility of OpenMP code objects found in the heterogeneous archive library with HIP code objects of the target -// RUN: clang-offload-bundler -unbundle -type=a -targets=hip-amdgcn-amd-amdhsa-gfx906,hipv4-amdgcn-amd-amdhsa-gfx908 -input=%t.input-archive.a -output=%t-hip-archive-gfx906-simple.a -output=%t-hipv4-archive-gfx908-simple.a -hip-openmp-compatible +// RUN: clang-offload-bundler -unbundle -type=a -targets=hip-amdgcn-amd-amdhsa--gfx906,hipv4-amdgcn-amd-amdhsa--gfx908 -input=%t.input-archive.a -output=%t-hip-archive-gfx906-simple.a -output=%t-hipv4-archive-gfx908-simple.a -hip-openmp-compatible // RUN: llvm-ar t %t-hip-archive-gfx906-simple.a | FileCheck %s -check-prefix=HIPOPENMPCOMPAT // HIPOPENMPCOMPAT: simple-openmp-amdgcn-amd-amdhsa--gfx906 // RUN: llvm-ar t %t-hipv4-archive-gfx908-simple.a | FileCheck %s -check-prefix=HIPv4OPENMPCOMPAT @@ -557,6 +557,12 @@ // RUN: llvm-ar t %T/hip-openmp_906.a | FileCheck -check-prefix=OPENMPHIPCOMPAT %s // OPENMPHIPCOMPAT: hip_bundle1-hip-amdgcn-amd-amdhsa--gfx906 +// Check if a malformat bundle id can be detected and an error can be emitted. 
+// RUN: not clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa -output=%T/hip-openmp_906.a -input=%T/hip_archive.a -hip-openmp-compatible 2>&1 | FileCheck %s -check-prefix=ERROR-WRONG-FORMAT +// ERROR-WRONG-FORMAT: error: Targets need to follow the format '<offload kind>-<target triple>', where '<target triple>' follows the format '<kind>-<arch>-<vendor>-<os>-<env>[-<target id>[:target features]]'. +// RUN: not clang-offload-bundler -unbundle -type=a -targets=openmp-amdgcn-amd-amdhsa-gfx906 -output=%T/hip-openmp_906.a -input=%T/hip_archive.a -hip-openmp-compatible 2>&1 | FileCheck %s -check-prefix=ERROR-NO-ENV +// ERROR-NO-ENV: error: no compatible code object found for the target 'openmp-amdgcn-amd-amdhsa--' + +// Some code so that we can create a binary out of this file. int A = 0; void test_func(void) { diff --git a/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip b/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip new file mode 100644 index 0000000000000..f17e56acfb7f7 --- /dev/null +++ b/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip @@ -0,0 +1,12 @@ +// RUN: %clang -### -nogpuinc -nogpulib --offload-arch=gfx1030 --offload-arch=gfx1100 --offload-arch=gfx1101 --target=x86_64-linux-gnu -MD -MF tmp.d %s 2>&1 | FileCheck %s + +// CHECK-NOT: {{.*}}clang{{.*}}"-target-cpu" "gfx1030"{{.*}}"-dependency-file" "tmp.d" +// CHECK: {{.*}}lld{{.*}}"-plugin-opt=mcpu=gfx1030" +// CHECK-NOT: {{.*}}clang{{.*}}"-target-cpu" "gfx1100"{{.*}}"-dependency-file" "tmp.d" +// CHECK: {{.*}}lld{{.*}}"-plugin-opt=mcpu=gfx1100" +// CHECK-NOT: {{.*}}clang{{.*}}"-target-cpu" "gfx1101"{{.*}}"-dependency-file" "tmp.d" +// CHECK: {{.*}}lld{{.*}}"-plugin-opt=mcpu=gfx1101" +// CHECK: {{.*}}clang-offload-bundler +// CHECK: {{.*}}clang{{.*}}"-target-cpu"{{.*}}"-dependency-file" "tmp.d" + +void main(){} diff --git a/clang/test/Driver/hip-link-bc-to-bc.hip b/clang/test/Driver/hip-link-bc-to-bc.hip index 249e82ae060fe..b372551e200e3 100644 --- a/clang/test/Driver/hip-link-bc-to-bc.hip +++ 
b/clang/test/Driver/hip-link-bc-to-bc.hip @@ -8,10 +8,10 @@ // RUN: --no-offload-new-driver %t/bundle1.bc %t/bundle2.bc \ // RUN: 2>&1 | FileCheck -check-prefix=BITCODE %s -// BITCODE: "{{.*}}clang-offload-bundler" "-type=bc" "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx906" "-input={{.*}}bundle1.bc" "-output=[[B1HOST:.*\.bc]]" "-output=[[B1DEV1:.*\.bc]]" "-unbundle" "-allow-missing-bundles" +// BITCODE: "{{.*}}clang-offload-bundler" "-type=bc" "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-unknown-gfx906" "-input={{.*}}bundle1.bc" "-output=[[B1HOST:.*\.bc]]" "-output=[[B1DEV1:.*\.bc]]" "-unbundle" "-allow-missing-bundles" // BITCODE: "{{.*}}clang{{.*}}" "-o" "[[B1DEV2:.*bundle1-gfx906-.*\.bc]]" "-x" "ir" "[[B1DEV1]]" -// BITCODE: "{{.*}}clang-offload-bundler" "-type=bc" "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx906" "-input={{.*}}bundle2.bc" "-output=[[B2HOST:.*\.bc]]" "-output=[[B2DEV1:.*\.bc]]" "-unbundle" "-allow-missing-bundles" +// BITCODE: "{{.*}}clang-offload-bundler" "-type=bc" "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-unknown-gfx906" "-input={{.*}}bundle2.bc" "-output=[[B2HOST:.*\.bc]]" "-output=[[B2DEV1:.*\.bc]]" "-unbundle" "-allow-missing-bundles" // BITCODE: "{{.*}}clang{{.*}}" "-o" "[[B2DEV2:.*bundle2-gfx906-.*\.bc]]" "-x" "ir" "[[B2DEV1]]" // BITCODE: "{{.*}}llvm-link" "-o" "bundle1-hip-amdgcn-amd-amdhsa-gfx906.bc" "[[B1DEV2]]" "[[B2DEV2]]" @@ -25,9 +25,9 @@ // RUN: --no-offload-new-driver %t/bundle.bc -L%t -lhipbundle \ // RUN: 2>&1 | FileCheck -check-prefix=ARCHIVE %s -// ARCHIVE: "{{.*}}clang-offload-bundler" "-type=bc" "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx906" "-input={{.*}}bundle.bc" "-output=[[HOST:.*\.bc]]" "-output=[[DEV1:.*\.bc]]" "-unbundle" "-allow-missing-bundles" +// ARCHIVE: "{{.*}}clang-offload-bundler" "-type=bc" "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-unknown-gfx906" "-input={{.*}}bundle.bc" 
"-output=[[HOST:.*\.bc]]" "-output=[[DEV1:.*\.bc]]" "-unbundle" "-allow-missing-bundles" // ARCHIVE: "{{.*}}clang{{.*}}" "-o" "[[DEV2:.*\.bc]]" "-x" "ir" "[[DEV1]]" -// ARCHIVE: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}libhipbundle.a" "-targets=hip-amdgcn-amd-amdhsa-gfx906" "-output=[[AR:.*\.a]]" "-allow-missing-bundles" "-hip-openmp-compatible" +// ARCHIVE: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}libhipbundle.a" "-targets=hip-amdgcn-amd-amdhsa-unknown-gfx906" "-output=[[AR:.*\.a]]" "-allow-missing-bundles" "-hip-openmp-compatible" // ARCHIVE: "{{.*}}llvm-link" "-o" "bundle-hip-amdgcn-amd-amdhsa-gfx906.bc" "[[DEV2]]" "[[AR]]" diff --git a/clang/test/Driver/hip-link-bundle-archive.hip b/clang/test/Driver/hip-link-bundle-archive.hip index 55cd301bc1595..96b8eb12f1452 100644 --- a/clang/test/Driver/hip-link-bundle-archive.hip +++ b/clang/test/Driver/hip-link-bundle-archive.hip @@ -70,11 +70,10 @@ // RUN: -nogpuinc -nogpulib %s -fgpu-rdc %t/hipBundled2.lib \ // RUN: 2>&1 | FileCheck -check-prefix=MSVC %s -// GNU1: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}[[LIB:libhipBundled\.a]]" "-targets=hip-amdgcn-amd-amdhsa-gfx1030" "-output=[[A1030:.*\.a]]" "-allow-missing-bundles" -// GNU2: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}[[LIB:libhipBundled\.a\.5\.2]]" "-targets=hip-amdgcn-amd-amdhsa-gfx1030" "-output=[[A1030:.*\.a]]" "-allow-missing-bundles" +// GNU1: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}[[LIB:libhipBundled\.a]]" "-targets=hip-amdgcn-amd-amdhsa-unknown-gfx1030" "-output=[[A1030:.*\.a]]" "-allow-missing-bundles" +// GNU2: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}[[LIB:libhipBundled\.a\.5\.2]]" "-targets=hip-amdgcn-amd-amdhsa-unknown-gfx1030" "-output=[[A1030:.*\.a]]" "-allow-missing-bundles" // GNU: "{{.*}}lld{{.*}}" {{.*}}"-plugin-opt=mcpu=gfx1030" {{.*}} "[[A1030]]" -// GNU: "{{.*}}clang-offload-bundler" "-unbundle" 
"-type=a" "-input={{.*}}[[LIB]]" "-targets=hip-amdgcn-amd-amdhsa-gfx906" "-output=[[A906:.*\.a]]" "-allow-missing-bundles" - +// GNU: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}[[LIB]]" "-targets=hip-amdgcn-amd-amdhsa-unknown-gfx906" "-output=[[A906:.*\.a]]" "-allow-missing-bundles" // GNU: "{{.*}}lld{{.*}}" {{.*}}"-plugin-opt=mcpu=gfx906" {{.*}} "[[A906]]" // GNU-L: "{{.*}}ld{{.*}}" {{.*}}"-o" "a.out" {{.*}}"-lhipBundled" // GNU-LA: "{{.*}}ld{{.*}}" {{.*}}"-o" "a.out" {{.*}}"-l:libhipBundled.a" @@ -82,8 +81,8 @@ // NONARCHIVE-NOT: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*libNonArchive\.a}}" // NONE-NOT: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*NoneExist\.a}}" -// MSVC: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}hipBundled2.lib" "-targets=hip-amdgcn-amd-amdhsa-gfx1030" "-output=[[A1030:.*\.a]]" "-allow-missing-bundles" +// MSVC: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}hipBundled2.lib" "-targets=hip-amdgcn-amd-amdhsa-unknown-gfx1030" "-output=[[A1030:.*\.a]]" "-allow-missing-bundles" // MSVC: "{{.*}}lld{{.*}}" {{.*}}"-plugin-opt=mcpu=gfx1030" {{.*}} "[[A1030]]" -// MSVC: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}hipBundled2.lib" "-targets=hip-amdgcn-amd-amdhsa-gfx906" "-output=[[A906:.*\.a]]" "-allow-missing-bundles" +// MSVC: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}hipBundled2.lib" "-targets=hip-amdgcn-amd-amdhsa-unknown-gfx906" "-output=[[A906:.*\.a]]" "-allow-missing-bundles" // MSVC: "{{.*}}lld{{.*}}" {{.*}}"-plugin-opt=mcpu=gfx906" {{.*}} "[[A906]]" // MSVC: "{{.*}}link{{.*}}" {{.*}}"-out:a.exe" {{.*}}hipBundled2.lib" diff --git a/clang/test/Driver/hip-offload-compress-zlib.hip b/clang/test/Driver/hip-offload-compress-zlib.hip index dc43e73e7ae8f..9f542c2053296 100644 --- a/clang/test/Driver/hip-offload-compress-zlib.hip +++ b/clang/test/Driver/hip-offload-compress-zlib.hip @@ -13,7 +13,7 @@ // 
RUN: 2>&1 | FileCheck %s // CHECK: clang-offload-bundler{{.*}} -type=bc -// CHECK-SAME: -targets={{.*}}hip-amdgcn-amd-amdhsa-gfx1100,hip-amdgcn-amd-amdhsa-gfx1101 +// CHECK-SAME: -targets={{.*}}hip-amdgcn-amd-amdhsa-unknown-gfx1100,hip-amdgcn-amd-amdhsa-unknown-gfx1101 // CHECK-SAME: -compress -verbose -compression-level=9 // CHECK: Compressed bundle format @@ -26,7 +26,7 @@ // RUN: 2>&1 | FileCheck -check-prefix=UNBUNDLE %s // UNBUNDLE: clang-offload-bundler{{.*}} "-type=bc" -// UNBUNDLE-SAME: -targets={{.*}}hip-amdgcn-amd-amdhsa-gfx1100,hip-amdgcn-amd-amdhsa-gfx1101 +// UNBUNDLE-SAME: -targets={{.*}}hip-amdgcn-amd-amdhsa-unknown-gfx1100,hip-amdgcn-amd-amdhsa-unknown-gfx1101 // UNBUNDLE-SAME: -unbundle // UNBUNDLE-SAME: -verbose diff --git a/clang/test/Driver/hip-offload-compress-zstd.hip b/clang/test/Driver/hip-offload-compress-zstd.hip index 69f28ab22ba60..dfe681feeb647 100644 --- a/clang/test/Driver/hip-offload-compress-zstd.hip +++ b/clang/test/Driver/hip-offload-compress-zstd.hip @@ -13,7 +13,7 @@ // RUN: 2>&1 | FileCheck %s // CHECK: clang-offload-bundler{{.*}} -type=bc -// CHECK-SAME: -targets={{.*}}hip-amdgcn-amd-amdhsa-gfx1100,hip-amdgcn-amd-amdhsa-gfx1101 +// CHECK-SAME: -targets={{.*}}hip-amdgcn-amd-amdhsa-unknown-gfx1100,hip-amdgcn-amd-amdhsa-unknown-gfx1101 // CHECK-SAME: -compress -verbose -compression-level=9 // CHECK: Compressed bundle format @@ -26,7 +26,7 @@ // RUN: 2>&1 | FileCheck -check-prefix=UNBUNDLE %s // UNBUNDLE: clang-offload-bundler{{.*}} "-type=bc" -// UNBUNDLE-SAME: -targets={{.*}}hip-amdgcn-amd-amdhsa-gfx1100,hip-amdgcn-amd-amdhsa-gfx1101 +// UNBUNDLE-SAME: -targets={{.*}}hip-amdgcn-amd-amdhsa-unknown-gfx1100,hip-amdgcn-amd-amdhsa-unknown-gfx1101 // UNBUNDLE-SAME: -unbundle // UNBUNDLE-SAME: -verbose diff --git a/clang/test/Driver/hip-options.hip b/clang/test/Driver/hip-options.hip index 8c13137735fb9..95444fe5eefb0 100644 --- a/clang/test/Driver/hip-options.hip +++ b/clang/test/Driver/hip-options.hip @@ -242,3 +242,7 @@ // 
NO-WARN-ATOMIC: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-Werror=atomic-alignment" {{.*}} "-Wno-error=atomic-alignment" // NO-WARN-ATOMIC-NOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-Werror=atomic-alignment" // NO-WARN-ATOMIC-NOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-Wno-error=atomic-alignment" + +// Check --offload-compress does not cause warning. +// RUN: %clang -### -Werror --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib \ +// RUN: --offload-arch=gfx1100 --offload-compress --offload-host-only -M %s diff --git a/clang/test/Driver/hip-rdc-device-only.hip b/clang/test/Driver/hip-rdc-device-only.hip index cbb2433f2a6a2..f5d83d013c86a 100644 --- a/clang/test/Driver/hip-rdc-device-only.hip +++ b/clang/test/Driver/hip-rdc-device-only.hip @@ -88,7 +88,7 @@ // COMMON-SAME: {{.*}} {{".*a.cu"}} // COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}" -// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-unknown-gfx803,hip-amdgcn-amd-amdhsa-unknown-gfx900" // COMMON-SAME: "-output=a-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}" // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" @@ -118,7 +118,7 @@ // COMMON-SAME: {{.*}} {{".*b.hip"}} // COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}" -// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-unknown-gfx803,hip-amdgcn-amd-amdhsa-unknown-gfx900" // COMMON-SAME: "-output=b-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}" // SAVETEMP: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu" @@ -148,7 +148,7 @@ // SAVETEMP-SAME: {{.*}} "-o" {{"a.*.ll"}} "-x" "ir" [[A_GFX900_TMP_BC]] // SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll" -// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// SAVETEMP-SAME: 
"-targets=hip-amdgcn-amd-amdhsa-unknown-gfx803,hip-amdgcn-amd-amdhsa-unknown-gfx900" // SAVETEMP-SAME: "-output=a-hip-amdgcn-amd-amdhsa.ll" // SAVETEMP: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu" @@ -178,7 +178,7 @@ // SAVETEMP-SAME: {{.*}} "-o" {{"b.*.ll"}} "-x" "ir" [[B_GFX900_TMP_BC]] // SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll" -// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-unknown-gfx803,hip-amdgcn-amd-amdhsa-unknown-gfx900" // SAVETEMP-SAME: "-output=b-hip-amdgcn-amd-amdhsa.ll" // FAIL: error: cannot specify -o when generating multiple output files diff --git a/clang/test/Driver/hip-runtime-libs-linux.hip b/clang/test/Driver/hip-runtime-libs-linux.hip index a4cd2733114b6..eda87d0aa4b6c 100644 --- a/clang/test/Driver/hip-runtime-libs-linux.hip +++ b/clang/test/Driver/hip-runtime-libs-linux.hip @@ -20,6 +20,11 @@ // RUN: --rocm-path=%S/Inputs/rocm %t.o -frtlib-add-rpath 2>&1 \ // RUN: | FileCheck -check-prefixes=ROCM-RPATH %s +// Test that a canonical HIP runtime path is passed to the -rpath flag +// RUN: %clang -### --hip-link --target=x86_64-linux-gnu \ +// RUN: --rocm-path=%S/Inputs/rocm/./bin/../include/../ %t.o -frtlib-add-rpath 2>&1 \ +// RUN: | FileCheck -check-prefixes=ROCM-RPATH-CANONICAL %s + // Test detecting latest /opt/rocm-{release} directory. 
// RUN: rm -rf %t && mkdir -p %t/opt // RUN: cp -r %S/Inputs/rocm %t/opt/rocm-3.9.0-1234 @@ -55,6 +60,7 @@ // ROCM-PATH: "-L[[HIPRT:.*/Inputs/rocm/lib]]" "-lamdhip64" // ROCM-RPATH: "-L[[HIPRT:.*/Inputs/rocm/lib]]" "-rpath" "[[HIPRT]]" "-lamdhip64" +// ROCM-RPATH-CANONICAL: "-rpath" "{{.*/rocm/lib}}" "-lamdhip64" // ROCM-REL: "-L[[HIPRT:.*/opt/rocm-3.10.0/lib]]" "-lamdhip64" // NOHIPRT-NOT: "-L{{.*/Inputs/rocm/lib}}" // NOHIPRT-NOT: "-rpath" "{{.*/Inputs/rocm/lib}}" diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip index 054db261d8e57..6c69d1d51a260 100644 --- a/clang/test/Driver/hip-toolchain-no-rdc.hip +++ b/clang/test/Driver/hip-toolchain-no-rdc.hip @@ -187,8 +187,8 @@ // Check mixed AMDGCNSPIRV and concrete GPU arch. // -// AMDGCNSPIRV: "-cc1" "-triple" "spirv64-amd-amdhsa" {{.*}}"-emit-obj" {{.*}} "-o" "[[AMDGCNSPV_OBJ:.*o]]" -// AMDGCNSPIRV: {{".*llvm-link.*"}} "-o" "[[AMDGCNSPV_TMP:.*out]]" "[[AMDGCNSPV_OBJ]]" +// AMDGCNSPIRV: "-cc1" "-triple" "spirv64-amd-amdhsa" {{.*}}"-emit-llvm-bc" {{.*}} "-o" "[[AMDGCNSPV_BC:.*bc]]" +// AMDGCNSPIRV: {{".*llvm-link.*"}} "-o" "[[AMDGCNSPV_TMP:.*out]]" "[[AMDGCNSPV_BC]]" // AMDGCNSPIRV: {{".*llvm-spirv.*"}} "--spirv-max-version=1.6" "--spirv-ext=+all" {{.*}} "[[AMDGCNSPV_TMP]]" {{.*}}"-o" "[[AMDGCNSPV_CO:.*out]]" // AMDGCNSPIRV: "-cc1" "-triple" "amdgcn-amd-amdhsa" {{.*}}"-emit-obj" {{.*}}"-target-cpu" "gfx900"{{.*}} "-o" "[[GFX900_OBJ:.*o]]" // AMDGCNSPIRV: {{".*lld.*"}} {{.*}}"-plugin-opt=mcpu=gfx900" {{.*}} "-o" "[[GFX900_CO:.*out]]" {{.*}}"[[GFX900_OBJ]]" diff --git a/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip new file mode 100644 index 0000000000000..4439547ea8ad9 --- /dev/null +++ b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip @@ -0,0 +1,35 @@ +// RUN: %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 -flto-partitions=42 \ +// RUN: --no-offload-new-driver 
--emit-static-lib -nogpulib \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefix=FIXED-PARTS + +// FIXED-PARTS-NOT: "*.llvm-link" +// FIXED-PARTS-NOT: ".*opt" +// FIXED-PARTS-NOT: ".*llc" +// FIXED-PARTS: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" +// FIXED-PARTS-SAME: "--lto-partitions=42" +// FIXED-PARTS-SAME: "-plugin-opt=mcpu=gfx803" +// FIXED-PARTS-SAME: "-o" "{{.*out}}" "{{.*bc}}" + +// RUN: not %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 -flto-partitions=a \ +// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV0 + +// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '-flto-partitions=a' + +// RUN: not %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 -flto-partitions=0 \ +// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV1 + +// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '-flto-partitions=0' diff --git a/clang/test/Driver/hip-toolchain-rdc-separate.hip b/clang/test/Driver/hip-toolchain-rdc-separate.hip index 80f325c5d7373..d3c7d2d5be55b 100644 --- a/clang/test/Driver/hip-toolchain-rdc-separate.hip +++ b/clang/test/Driver/hip-toolchain-rdc-separate.hip @@ -41,7 +41,7 @@ // CHECK-SAME: {{.*}} [[A_SRC]] // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// CHECK-SAME: 
"-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900,host-x86_64-unknown-linux-gnu" +// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa-unknown-gfx803,hip-amdgcn-amd-amdhsa-unknown-gfx900,host-x86_64-unknown-linux-gnu" // CHECK-SAME: "-output=[[A_O:.*a.o]]" "-input=[[A_BC1]]" "-input=[[A_BC2]]" "-input=[[A_OBJ_HOST]]" // CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" @@ -76,7 +76,7 @@ // CHECK-SAME: {{.*}} [[B_SRC]] // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900,host-x86_64-unknown-linux-gnu" +// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa-unknown-gfx803,hip-amdgcn-amd-amdhsa-unknown-gfx900,host-x86_64-unknown-linux-gnu" // CHECK-SAME: "-output=[[B_O:.*b.o]]" "-input=[[B_BC1]]" "-input=[[B_BC2]]" "-input=[[B_OBJ_HOST]]" // RUN: touch %t/a.o %t/b.o @@ -99,22 +99,22 @@ // RUN: 2>&1 | FileCheck -check-prefixes=LINK,LLD-FIN,LINK-NOBUNDLE,LINK-NOEMBED %s // LINK-HOST-UNBUNDLE: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// LINK-HOST-UNBUNDLE-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// LINK-HOST-UNBUNDLE-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-unknown-gfx803,hip-amdgcn-amd-amdhsa-unknown-gfx900" // LINK-HOST-UNBUNDLE-SAME: "-input=[[A_O:.*a.o]]" "-output=[[A_OBJ_HOST:.*o]]" "-output={{.*o}}" "-output={{.*o}}" // LINK-HOST-UNBUNDLE: "-unbundle" "-allow-missing-bundles" // LINK-HOST-UNBUNDLE: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// LINK-HOST-UNBUNDLE-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// LINK-HOST-UNBUNDLE-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-unknown-gfx803,hip-amdgcn-amd-amdhsa-unknown-gfx900" // LINK-HOST-UNBUNDLE-SAME: "-input=[[B_O:.*b.o]]" "-output=[[B_OBJ_HOST:.*o]]" "-output={{.*o}}" "-output={{.*o}}" // LINK-HOST-UNBUNDLE: "-unbundle" 
"-allow-missing-bundles" // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-unknown-gfx803,hip-amdgcn-amd-amdhsa-unknown-gfx900" // LINK-SAME: "-input=[[A_O:.*a.o]]" "-output={{.*o}}" "-output=[[A_BC1:.*o]]" "-output=[[A_BC2:.*o]]" // LINK-SAME: "-unbundle" "-allow-missing-bundles" // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-unknown-gfx803,hip-amdgcn-amd-amdhsa-unknown-gfx900" // LINK-SAME: "-input=[[B_O:.*b.o]]" "-output={{.*o}}" "-output=[[B_BC1:.*o]]" "-output=[[B_BC2:.*o]]" // LINK-SAME: "-unbundle" "-allow-missing-bundles" diff --git a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip index 5276faf31bdc2..05d276ba57bda 100644 --- a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip +++ b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip @@ -48,6 +48,7 @@ // CHECK-NOT: ".*opt" // CHECK-NOT: ".*llc" // CHECK: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" +// CHECK-SAME: "--lto-partitions={{[0-9]+}}" // CHECK-SAME: "-plugin-opt=mcpu=gfx803" // CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]] @@ -76,6 +77,7 @@ // CHECK-NOT: ".*opt" // CHECK-NOT: ".*llc" // CHECK: [[LLD]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" +// CHECK-SAME: "--lto-partitions={{[0-9]+}}" // CHECK-SAME: "-plugin-opt=mcpu=gfx900" // CHECK-SAME: "--whole-archive" // CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[A_BC2]] [[B_BC2]] diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip index 96da423144c1c..8cf7a968f7368 100644 --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ 
b/clang/test/Driver/hip-toolchain-rdc.hip @@ -146,6 +146,7 @@ // CHECK-NOT: ".*opt" // CHECK-NOT: ".*llc" // CHECK: {{".*lld.*"}} {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" +// CHECK-SAME: "--lto-partitions={{[0-9]+}}" // CHECK-SAME: "-plugin-opt=mcpu=gfx900" // CHECK-SAME: "-o" "[[IMG_DEV2:.*.out]]" [[A_BC2]] [[B_BC2]] @@ -160,3 +161,21 @@ // output the executable // LNX: [[LD:".*ld.*"]] {{.*}}"-o" "a.out" {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]] [[OBJBUNDLE]] // MSVC: [[LD:".*lld-link.*"]] {{.*}}"-out:a.exe" {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]] [[OBJBUNDLE]] + +// Check -flto-partitions + +// RUN: %clang -### -fgpu-rdc --offload-arch=gfx90a -nogpulib -nogpuinc --no-offload-new-driver \ +// RUN: -L. -foffload-lto %s 2>&1 | FileCheck -check-prefix=LTO_DEFAULT %s +// LTO_DEFAULT: lld{{.*}}"--lto-partitions=8" + +// RUN: %clang -### -fgpu-rdc --offload-arch=gfx90a -nogpulib -nogpuinc --no-offload-new-driver \ +// RUN: -L. -foffload-lto -flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s +// LTO_PARTS: lld{{.*}}"--lto-partitions=42" + +// RUN: not %clang -### -fgpu-rdc --offload-arch=gfx90a -nogpulib -nogpuinc --no-offload-new-driver \ +// RUN: -L. -foffload-lto -flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s +// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '-flto-partitions=a' + +// RUN: not %clang -### -fgpu-rdc --offload-arch=gfx90a -nogpulib -nogpuinc --no-offload-new-driver \ +// RUN: -L. 
-foffload-lto -flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s +// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '-flto-partitions=0' diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index 43c25c1b3d74e..a604b3cbb006b 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -1,5 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target // Test without OCML_BASIC_ROUNDED_OPERATIONS // RUN: %clang_cc1 -include __clang_hip_runtime_wrapper.h \ @@ -26,6 +27,14 @@ // RUN: -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -O1 -fgpu-approx-transcendentals -o - \ // RUN: -D__HIPCC_RTC__ | FileCheck -check-prefixes=CHECK,APPROX %s +// Check that we use the AMDGCNSPIRV address space map +// RUN: %clang_cc1 -include __clang_hip_runtime_wrapper.h \ +// RUN: -internal-isystem %S/../../lib/Headers/cuda_wrappers \ +// RUN: -internal-isystem %S/Inputs/include \ +// RUN: -triple spirv64-amd-amdhsa -aux-triple x86_64-unknown-unknown \ +// RUN: -emit-llvm %s -fcuda-is-device -O1 -o - \ +// RUN: -D__HIPCC_RTC__ | FileCheck -check-prefixes=AMDGCNSPIRV %s + #define BOOL_TYPE int typedef unsigned long long uint64_t; @@ -57,6 +66,30 @@ typedef unsigned long long uint64_t; // CHECK-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, [[CLEANUP_I]] ], [ [[__R_0_I]], [[WHILE_COND_I]] ] // CHECK-NEXT: ret i64 [[RETVAL_2_I]] // +// AMDGCNSPIRV-LABEL: @test___make_mantissa_base8( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: br label [[WHILE_COND_I:%.*]] +// AMDGCNSPIRV: while.cond.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I:%.*]] = phi ptr addrspace(4) [ [[P:%.*]], [[ENTRY:%.*]] ], [ [[__TAGP_ADDR_1_I:%.*]], [[WHILE_BODY_I:%.*]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[__R_1_I:%.*]], [[WHILE_BODY_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load 
i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], align 1, !tbaa [[TBAA5:![0-9]+]] +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP0]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label [[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT:%.*]], label [[WHILE_BODY_I]] +// AMDGCNSPIRV: while.body.i: +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], -8 +// AMDGCNSPIRV-NEXT: [[OR_COND_I:%.*]] = icmp eq i8 [[TMP1]], 48 +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = shl i64 [[__R_0_I]], 3 +// AMDGCNSPIRV-NEXT: [[CONV5_I:%.*]] = zext nneg i8 [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: [[ADD_I:%.*]] = add i64 [[MUL_I]], -48 +// AMDGCNSPIRV-NEXT: [[SUB_I:%.*]] = add i64 [[ADD_I]], [[CONV5_I]] +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I:%.*]] = zext i1 [[OR_COND_I]] to i64 +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], i64 [[__TAGP_ADDR_1_IDX_I]] +// AMDGCNSPIRV-NEXT: [[__R_1_I]] = select i1 [[OR_COND_I]], i64 [[SUB_I]], i64 [[__R_0_I]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], label [[WHILE_COND_I]], label [[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]], !llvm.loop [[LOOP8:![0-9]+]] +// AMDGCNSPIRV: _ZL21__make_mantissa_base8PKc.exit: +// AMDGCNSPIRV-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_I]] ], [ [[__R_0_I]], [[WHILE_COND_I]] ] +// AMDGCNSPIRV-NEXT: ret i64 [[RETVAL_2_I]] +// extern "C" __device__ uint64_t test___make_mantissa_base8(const char *p) { return __make_mantissa_base8(p); } @@ -89,6 +122,30 @@ extern "C" __device__ uint64_t test___make_mantissa_base8(const char *p) { // CHECK-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, [[CLEANUP_I]] ], [ [[__R_0_I]], [[WHILE_COND_I]] ] // CHECK-NEXT: ret i64 [[RETVAL_2_I]] // +// AMDGCNSPIRV-LABEL: @test___make_mantissa_base10( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: br label [[WHILE_COND_I:%.*]] +// AMDGCNSPIRV: while.cond.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I:%.*]] = phi ptr addrspace(4) [ [[P:%.*]], [[ENTRY:%.*]] ], [ [[__TAGP_ADDR_1_I:%.*]], [[WHILE_BODY_I:%.*]] ] 
+// AMDGCNSPIRV-NEXT: [[__R_0_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[__R_1_I:%.*]], [[WHILE_BODY_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP0]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label [[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT:%.*]], label [[WHILE_BODY_I]] +// AMDGCNSPIRV: while.body.i: +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = add i8 [[TMP0]], -48 +// AMDGCNSPIRV-NEXT: [[OR_COND_I:%.*]] = icmp ult i8 [[TMP1]], 10 +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = mul i64 [[__R_0_I]], 10 +// AMDGCNSPIRV-NEXT: [[CONV5_I:%.*]] = zext nneg i8 [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: [[ADD_I:%.*]] = add i64 [[MUL_I]], -48 +// AMDGCNSPIRV-NEXT: [[SUB_I:%.*]] = add i64 [[ADD_I]], [[CONV5_I]] +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I:%.*]] = zext i1 [[OR_COND_I]] to i64 +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], i64 [[__TAGP_ADDR_1_IDX_I]] +// AMDGCNSPIRV-NEXT: [[__R_1_I]] = select i1 [[OR_COND_I]], i64 [[SUB_I]], i64 [[__R_0_I]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], label [[WHILE_COND_I]], label [[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +// AMDGCNSPIRV: _ZL22__make_mantissa_base10PKc.exit: +// AMDGCNSPIRV-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_I]] ], [ [[__R_0_I]], [[WHILE_COND_I]] ] +// AMDGCNSPIRV-NEXT: ret i64 [[RETVAL_2_I]] +// extern "C" __device__ uint64_t test___make_mantissa_base10(const char *p) { return __make_mantissa_base10(p); } @@ -131,6 +188,44 @@ extern "C" __device__ uint64_t test___make_mantissa_base10(const char *p) { // CHECK-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, [[CLEANUP_I]] ], [ [[__R_0_I]], [[WHILE_COND_I]] ] // CHECK-NEXT: ret i64 [[RETVAL_2_I]] // +// AMDGCNSPIRV-LABEL: @test___make_mantissa_base16( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: br label [[WHILE_COND_I:%.*]] +// AMDGCNSPIRV: while.cond.i: 
+// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I:%.*]] = phi ptr addrspace(4) [ [[P:%.*]], [[ENTRY:%.*]] ], [ [[__TAGP_ADDR_1_I:%.*]], [[CLEANUP_I:%.*]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[__R_2_I:%.*]], [[CLEANUP_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP0]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label [[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] +// AMDGCNSPIRV: while.body.i: +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = add i8 [[TMP0]], -48 +// AMDGCNSPIRV-NEXT: [[OR_COND_I:%.*]] = icmp ult i8 [[TMP1]], 10 +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], label [[IF_END31_I:%.*]], label [[IF_ELSE_I:%.*]] +// AMDGCNSPIRV: if.else.i: +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = add i8 [[TMP0]], -97 +// AMDGCNSPIRV-NEXT: [[OR_COND33_I:%.*]] = icmp ult i8 [[TMP2]], 6 +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND33_I]], label [[IF_END31_I]], label [[IF_ELSE17_I:%.*]] +// AMDGCNSPIRV: if.else17.i: +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = add i8 [[TMP0]], -65 +// AMDGCNSPIRV-NEXT: [[OR_COND34_I:%.*]] = icmp ult i8 [[TMP3]], 6 +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND34_I]], label [[IF_END31_I]], label [[CLEANUP_I]] +// AMDGCNSPIRV: if.end31.i: +// AMDGCNSPIRV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I]] ], [ -87, [[IF_ELSE_I]] ], [ -55, [[IF_ELSE17_I]] ] +// AMDGCNSPIRV-NEXT: [[MUL24_I:%.*]] = shl i64 [[__R_0_I]], 4 +// AMDGCNSPIRV-NEXT: [[CONV25_I:%.*]] = zext nneg i8 [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: [[ADD26_I:%.*]] = add i64 [[MUL24_I]], [[DOTSINK]] +// AMDGCNSPIRV-NEXT: [[ADD28_I:%.*]] = add i64 [[ADD26_I]], [[CONV25_I]] +// AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], i64 1 +// AMDGCNSPIRV-NEXT: br label [[CLEANUP_I]] +// AMDGCNSPIRV: cleanup.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I]], 
[[IF_END31_I]] ], [ [[__TAGP_ADDR_0_I]], [[IF_ELSE17_I]] ] +// AMDGCNSPIRV-NEXT: [[__R_2_I]] = phi i64 [ [[ADD28_I]], [[IF_END31_I]] ], [ [[__R_0_I]], [[IF_ELSE17_I]] ] +// AMDGCNSPIRV-NEXT: [[COND_I:%.*]] = phi i1 [ true, [[IF_END31_I]] ], [ false, [[IF_ELSE17_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[COND_I]], label [[WHILE_COND_I]], label [[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]], !llvm.loop [[LOOP12:![0-9]+]] +// AMDGCNSPIRV: _ZL22__make_mantissa_base16PKc.exit: +// AMDGCNSPIRV-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, [[CLEANUP_I]] ], [ [[__R_0_I]], [[WHILE_COND_I]] ] +// AMDGCNSPIRV-NEXT: ret i64 [[RETVAL_2_I]] +// extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { return __make_mantissa_base16(p); } @@ -226,6 +321,89 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 0, [[CLEANUP_I20_I]] ], [ [[__R_0_I16_I]], [[WHILE_COND_I14_I]] ], [ 0, [[CLEANUP_I_I]] ], [ [[__R_0_I_I]], [[WHILE_COND_I_I]] ], [ 0, [[CLEANUP_I36_I]] ], [ [[__R_0_I32_I]], [[WHILE_COND_I30_I]] ] // CHECK-NEXT: ret i64 [[RETVAL_0_I]] // +// AMDGCNSPIRV-LABEL: @test___make_mantissa( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[P:%.*]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_I:%.*]] = icmp eq i8 [[TMP0]], 48 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I:%.*]], label [[WHILE_COND_I27_I:%.*]] +// AMDGCNSPIRV: if.then.i: +// AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[P]], i64 1 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I14_I:%.*]] [ +// AMDGCNSPIRV-NEXT: i8 120, label [[WHILE_COND_I_I_PREHEADER:%.*]] +// AMDGCNSPIRV-NEXT: i8 88, label [[WHILE_COND_I_I_PREHEADER]] +// AMDGCNSPIRV-NEXT: ] +// AMDGCNSPIRV: while.cond.i.i.preheader: +// AMDGCNSPIRV-NEXT: br label 
[[WHILE_COND_I_I:%.*]] +// AMDGCNSPIRV: while.cond.i.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I_I:%.*]], [[CLEANUP_I_I:%.*]] ], [ [[INCDEC_PTR_I]], [[WHILE_COND_I_I_PREHEADER]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I_I:%.*]] = phi i64 [ [[__R_2_I_I:%.*]], [[CLEANUP_I_I]] ], [ 0, [[WHILE_COND_I_I_PREHEADER]] ] +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT:%.*]], label [[WHILE_BODY_I_I:%.*]] +// AMDGCNSPIRV: while.body.i.i: +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 +// AMDGCNSPIRV-NEXT: [[OR_COND_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I_I]], label [[IF_END31_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// AMDGCNSPIRV: if.else.i.i: +// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 +// AMDGCNSPIRV-NEXT: [[OR_COND33_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND33_I_I]], label [[IF_END31_I_I]], label [[IF_ELSE17_I_I:%.*]] +// AMDGCNSPIRV: if.else17.i.i: +// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 +// AMDGCNSPIRV-NEXT: [[OR_COND34_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND34_I_I]], label [[IF_END31_I_I]], label [[CLEANUP_I_I]] +// AMDGCNSPIRV: if.end31.i.i: +// AMDGCNSPIRV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I_I]] ], [ -87, [[IF_ELSE_I_I]] ], [ -55, [[IF_ELSE17_I_I]] ] +// AMDGCNSPIRV-NEXT: [[MUL24_I_I:%.*]] = shl i64 [[__R_0_I_I]], 4 +// AMDGCNSPIRV-NEXT: [[CONV25_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 +// AMDGCNSPIRV-NEXT: [[ADD26_I_I:%.*]] = add i64 [[MUL24_I_I]], [[DOTSINK]] +// AMDGCNSPIRV-NEXT: [[ADD28_I_I:%.*]] = add i64 [[ADD26_I_I]], [[CONV25_I_I]] +// AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I]], i64 
1 +// AMDGCNSPIRV-NEXT: br label [[CLEANUP_I_I]] +// AMDGCNSPIRV: cleanup.i.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_I]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I_I]], [[IF_END31_I_I]] ], [ [[__TAGP_ADDR_0_I_I]], [[IF_ELSE17_I_I]] ] +// AMDGCNSPIRV-NEXT: [[__R_2_I_I]] = phi i64 [ [[ADD28_I_I]], [[IF_END31_I_I]] ], [ [[__R_0_I_I]], [[IF_ELSE17_I_I]] ] +// AMDGCNSPIRV-NEXT: [[COND_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I]] ], [ false, [[IF_ELSE17_I_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[COND_I_I]], label [[WHILE_COND_I_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], !llvm.loop [[LOOP12]] +// AMDGCNSPIRV: while.cond.i14.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I15_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I24_I:%.*]], [[WHILE_BODY_I18_I:%.*]] ], [ [[INCDEC_PTR_I]], [[IF_THEN_I]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I16_I:%.*]] = phi i64 [ [[__R_1_I25_I:%.*]], [[WHILE_BODY_I18_I]] ], [ 0, [[IF_THEN_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I17_I:%.*]] = icmp eq i8 [[TMP6]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I17_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], label [[WHILE_BODY_I18_I]] +// AMDGCNSPIRV: while.body.i18.i: +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 +// AMDGCNSPIRV-NEXT: [[OR_COND_I19_I:%.*]] = icmp eq i8 [[TMP7]], 48 +// AMDGCNSPIRV-NEXT: [[MUL_I20_I:%.*]] = shl i64 [[__R_0_I16_I]], 3 +// AMDGCNSPIRV-NEXT: [[CONV5_I21_I:%.*]] = zext nneg i8 [[TMP6]] to i64 +// AMDGCNSPIRV-NEXT: [[ADD_I22_I:%.*]] = add i64 [[MUL_I20_I]], -48 +// AMDGCNSPIRV-NEXT: [[SUB_I23_I:%.*]] = add i64 [[ADD_I22_I]], [[CONV5_I21_I]] +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I_I:%.*]] = zext i1 [[OR_COND_I19_I]] to i64 +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I24_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I]], i64 [[__TAGP_ADDR_1_IDX_I_I]] +// AMDGCNSPIRV-NEXT: [[__R_1_I25_I]] = select i1 [[OR_COND_I19_I]], i64 [[SUB_I23_I]], i64 
[[__R_0_I16_I]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I19_I]], label [[WHILE_COND_I14_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], !llvm.loop [[LOOP8]] +// AMDGCNSPIRV: while.cond.i27.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I28_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I38_I:%.*]], [[WHILE_BODY_I31_I:%.*]] ], [ [[P]], [[ENTRY:%.*]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I29_I:%.*]] = phi i64 [ [[__R_1_I39_I:%.*]], [[WHILE_BODY_I31_I]] ], [ 0, [[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I28_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I30_I:%.*]] = icmp eq i8 [[TMP8]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I30_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], label [[WHILE_BODY_I31_I]] +// AMDGCNSPIRV: while.body.i31.i: +// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 +// AMDGCNSPIRV-NEXT: [[OR_COND_I32_I:%.*]] = icmp ult i8 [[TMP9]], 10 +// AMDGCNSPIRV-NEXT: [[MUL_I33_I:%.*]] = mul i64 [[__R_0_I29_I]], 10 +// AMDGCNSPIRV-NEXT: [[CONV5_I34_I:%.*]] = zext nneg i8 [[TMP8]] to i64 +// AMDGCNSPIRV-NEXT: [[ADD_I35_I:%.*]] = add i64 [[MUL_I33_I]], -48 +// AMDGCNSPIRV-NEXT: [[SUB_I36_I:%.*]] = add i64 [[ADD_I35_I]], [[CONV5_I34_I]] +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I37_I:%.*]] = zext i1 [[OR_COND_I32_I]] to i64 +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I38_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I28_I]], i64 [[__TAGP_ADDR_1_IDX_I37_I]] +// AMDGCNSPIRV-NEXT: [[__R_1_I39_I]] = select i1 [[OR_COND_I32_I]], i64 [[SUB_I36_I]], i64 [[__R_0_I29_I]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I32_I]], label [[WHILE_COND_I27_I]], label [[_ZL15__MAKE_MANTISSAPKC_EXIT]], !llvm.loop [[LOOP11]] +// AMDGCNSPIRV: _ZL15__make_mantissaPKc.exit: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_I18_I]] ], [ [[__R_0_I16_I]], [[WHILE_COND_I14_I]] ], [ 0, [[CLEANUP_I_I]] ], [ [[__R_0_I_I]], [[WHILE_COND_I_I]] ], [ 0, [[WHILE_BODY_I31_I]] ], [ [[__R_0_I29_I]], 
[[WHILE_COND_I27_I]] ] +// AMDGCNSPIRV-NEXT: ret i64 [[RETVAL_0_I]] +// extern "C" __device__ uint64_t test___make_mantissa(const char *p) { return __make_mantissa(p); } @@ -235,6 +413,11 @@ extern "C" __device__ uint64_t test___make_mantissa(const char *p) { // CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, -2147483648) i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) // CHECK-NEXT: ret i32 [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_abs( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, -2147483648) addrspace(4) i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) +// AMDGCNSPIRV-NEXT: ret i32 [[TMP0]] +// extern "C" __device__ int test_abs(int x) { return abs(x); } @@ -244,6 +427,11 @@ extern "C" __device__ int test_abs(int x) { // CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) // CHECK-NEXT: ret i64 [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_labs( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) addrspace(4) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) +// AMDGCNSPIRV-NEXT: ret i64 [[TMP0]] +// extern "C" __device__ long test_labs(long x) { return labs(x); } @@ -253,6 +441,11 @@ extern "C" __device__ long test_labs(long x) { // CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) // CHECK-NEXT: ret i64 [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_llabs( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) addrspace(4) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true) +// AMDGCNSPIRV-NEXT: ret i64 [[TMP0]] +// extern "C" __device__ long long test_llabs(long x) { return llabs(x); } @@ -272,6 +465,11 @@ extern "C" __device__ long long test_llabs(long x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acos_f32(float noundef [[X:%.*]]) 
#[[ATTR12:[0-9]+]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_acosf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_acos_f32(float noundef [[X:%.*]]) #[[ATTR12:[0-9]+]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_acosf(float x) { return acosf(x); } @@ -291,6 +489,11 @@ extern "C" __device__ float test_acosf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_acos( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_acos_f64(double noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_acos(double x) { return acos(x); } @@ -310,6 +513,11 @@ extern "C" __device__ double test_acos(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR13:[0-9]+]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_acoshf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_acosh_f32(float noundef [[X:%.*]]) #[[ATTR13:[0-9]+]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_acoshf(float x) { return acoshf(x); } @@ -329,6 +537,11 @@ extern "C" __device__ float test_acoshf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_acosh( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_acosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// 
AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_acosh(double x) { return acosh(x); } @@ -348,6 +561,11 @@ extern "C" __device__ double test_acosh(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_asinf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_asin_f32(float noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_asinf(float x) { return asinf(x); } @@ -367,6 +585,11 @@ extern "C" __device__ float test_asinf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_asin( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_asin_f64(double noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_asin(double x) { return asin(x); @@ -387,6 +610,11 @@ extern "C" __device__ double test_asin(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_asinhf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_asinh_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_asinhf(float x) { return asinhf(x); } @@ -406,6 +634,11 @@ extern "C" __device__ float test_asinhf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_asinh_f64(double noundef [[X:%.*]]) 
#[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_asinh( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_asinh_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_asinh(double x) { return asinh(x); } @@ -425,6 +658,11 @@ extern "C" __device__ double test_asinh(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_atan2f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_atan2_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_atan2f(float x, float y) { return atan2f(x, y); } @@ -444,6 +682,11 @@ extern "C" __device__ float test_atan2f(float x, float y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_atan2( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_atan2_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_atan2(double x, double y) { return atan2(x, y); } @@ -463,6 +706,11 @@ extern "C" __device__ double test_atan2(double x, double y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_atanf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: 
[[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_atan_f32(float noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_atanf(float x) { return atanf(x); } @@ -482,6 +730,11 @@ extern "C" __device__ float test_atanf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_atan( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_atan_f64(double noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_atan(double x) { return atan(x); } @@ -501,6 +754,11 @@ extern "C" __device__ double test_atan(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_atanhf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_atanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_atanhf(float x) { return atanhf(x); } @@ -520,6 +778,11 @@ extern "C" __device__ float test_atanhf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_atanh( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_atanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_atanh(double x) { return atanh(x); } @@ -539,6 +802,11 @@ extern "C" __device__ double 
test_atanh(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_cbrtf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_cbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_cbrtf(float x) { return cbrtf(x); } @@ -558,6 +826,11 @@ extern "C" __device__ float test_cbrtf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_cbrt( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_cbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_cbrt(double x) { return cbrt(x); } @@ -577,6 +850,11 @@ extern "C" __device__ double test_cbrt(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ceil.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_ceilf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.ceil.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_ceilf(float x) { return ceilf(x); } @@ -596,6 +874,11 @@ extern "C" __device__ float test_ceilf(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ceil.f64(double [[X:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_ceil( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.ceil.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret 
double [[TMP0]] +// extern "C" __device__ double test_ceil(double x) { return ceil(x); } @@ -615,6 +898,11 @@ extern "C" __device__ double test_ceil(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_copysignf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_copysignf(float x, float y) { return copysignf(x, y); } @@ -634,6 +922,11 @@ extern "C" __device__ float test_copysignf(float x, float y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_copysign( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_copysign(double x, double y) { return copysign(x, y); } @@ -653,6 +946,11 @@ extern "C" __device__ double test_copysign(double x, double y) { // APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] // APPROX-NEXT: ret float [[CALL_I_I]] // +// AMDGCNSPIRV-LABEL: @test_cosf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR14:[0-9]+]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_cosf(float x) { return cosf(x); } @@ -672,6 +970,11 @@ extern "C" __device__ float test_cosf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cos_f64(double 
noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_cos( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_cos_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_cos(double x) { return cos(x); } @@ -691,6 +994,11 @@ extern "C" __device__ double test_cos(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_coshf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_cosh_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_coshf(float x) { return coshf(x); } @@ -710,6 +1018,11 @@ extern "C" __device__ float test_coshf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_cosh( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_cosh_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_cosh(double x) { return cosh(x); } @@ -729,6 +1042,11 @@ extern "C" __device__ double test_cosh(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_cospif( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_cospi_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: 
ret float [[CALL_I]] +// extern "C" __device__ float test_cospif(float x) { return cospif(x); } @@ -748,6 +1066,11 @@ extern "C" __device__ float test_cospif(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_cospi( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_cospi_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_cospi(double x) { return cospi(x); } @@ -767,6 +1090,11 @@ extern "C" __device__ double test_cospi(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_cyl_bessel_i0f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_i0_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_cyl_bessel_i0f(float x) { return cyl_bessel_i0f(x); } @@ -786,11 +1114,15 @@ extern "C" __device__ float test_cyl_bessel_i0f(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_cyl_bessel_i0( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_i0_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_cyl_bessel_i0(double x) { return cyl_bessel_i0(x); } -// // DEFAULT-LABEL: @test_cyl_bessel_i1f( // DEFAULT-NEXT: entry: // DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef float 
@__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR14]] @@ -806,6 +1138,11 @@ extern "C" __device__ double test_cyl_bessel_i0(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_cyl_bessel_i1f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_i1_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_cyl_bessel_i1f(float x) { return cyl_bessel_i1f(x); } @@ -825,6 +1162,11 @@ extern "C" __device__ float test_cyl_bessel_i1f(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_cyl_bessel_i1( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_i1_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_cyl_bessel_i1(double x) { return cyl_bessel_i1(x); } @@ -844,6 +1186,11 @@ extern "C" __device__ double test_cyl_bessel_i1(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_erfcf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_erfc_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_erfcf(float x) { return erfcf(x); } @@ -863,6 +1210,11 @@ extern "C" __device__ float test_erfcf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfc_f64(double noundef [[X:%.*]]) 
#[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_erfc( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_erfc_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_erfc(double x) { return erfc(x); } @@ -882,6 +1234,11 @@ extern "C" __device__ double test_erfc(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_erfinvf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_erfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_erfinvf(float x) { return erfinvf(x); } @@ -901,6 +1258,11 @@ extern "C" __device__ float test_erfinvf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_erfinv( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_erfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_erfinv(double x) { return erfinv(x); } @@ -920,6 +1282,11 @@ extern "C" __device__ double test_erfinv(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_exp10f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_exp10_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// 
AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_exp10f(float x) { return exp10f(x); } @@ -939,6 +1306,11 @@ extern "C" __device__ float test_exp10f(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_exp10( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_exp10_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_exp10(double x) { return exp10(x); } @@ -958,6 +1330,11 @@ extern "C" __device__ double test_exp10(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp2.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_exp2f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.exp2.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_exp2f(float x) { return exp2f(x); } @@ -977,6 +1354,11 @@ extern "C" __device__ float test_exp2f(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_exp2( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_exp2_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_exp2(double x) { return exp2(x); } @@ -996,6 +1378,11 @@ extern "C" __device__ double test_exp2(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.exp.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_expf( 
+// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.exp.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_expf(float x) { return expf(x); } @@ -1015,6 +1402,11 @@ extern "C" __device__ float test_expf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_exp( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_exp_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_exp(double x) { return exp(x); } @@ -1034,6 +1426,11 @@ extern "C" __device__ double test_exp(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_expm1f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_expm1_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_expm1f(float x) { return expm1f(x); } @@ -1053,6 +1450,11 @@ extern "C" __device__ float test_expm1f(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_expm1( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_expm1_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_expm1(double x) { return expm1(x); } @@ -1072,6 +1474,11 @@ extern "C" 
__device__ double test_expm1(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fabs.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_fabsf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.fabs.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_fabsf(float x) { return fabsf(x); } @@ -1091,6 +1498,11 @@ extern "C" __device__ float test_fabsf(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fabs.f64(double [[X:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_fabs( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.fabs.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_fabs(double x) { return fabs(x); } @@ -1110,6 +1522,11 @@ extern "C" __device__ double test_fabs(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_fdimf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_fdim_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_fdimf(float x, float y) { return fdimf(x, y); } @@ -1129,6 +1546,11 @@ extern "C" __device__ float test_fdimf(float x, float y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_fdim( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract 
spir_func noundef addrspace(4) double @__ocml_fdim_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_fdim(double x, double y) { return fdim(x, y); } @@ -1148,6 +1570,11 @@ extern "C" __device__ double test_fdim(double x, double y) { // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X:%.*]], [[Y:%.*]] // APPROX-NEXT: ret float [[DIV_I]] // +// AMDGCNSPIRV-LABEL: @test_fdividef( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-NEXT: ret float [[DIV_I]] +// extern "C" __device__ float test_fdividef(float x, float y) { return fdividef(x, y); } @@ -1167,6 +1594,11 @@ extern "C" __device__ float test_fdividef(float x, float y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.floor.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_floorf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.floor.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_floorf(float x) { return floorf(x); } @@ -1186,6 +1618,11 @@ extern "C" __device__ float test_floorf(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.floor.f64(double [[X:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_floor( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.floor.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_floor(double x) { return floor(x); } @@ -1205,6 +1642,11 @@ extern "C" __device__ double test_floor(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // 
+// AMDGCNSPIRV-LABEL: @test_fmaf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_fmaf(float x, float y, float z) { return fmaf(x, y, z); } @@ -1224,6 +1666,11 @@ extern "C" __device__ float test_fmaf(float x, float y, float z) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_fma( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_fma(double x, double y, double z) { return fma(x, y, z); } @@ -1243,6 +1690,11 @@ extern "C" __device__ double test_fma(double x, double y, double z) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_fma_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_fma_rn(double x, double y, double z) { return __fma_rn(x, y, z); } @@ -1262,6 +1714,11 @@ extern "C" __device__ double test_fma_rn(double x, double y, double z) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_fmaxf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float 
@llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_fmaxf(float x, float y) { return fmaxf(x, y); } @@ -1281,6 +1738,11 @@ extern "C" __device__ float test_fmaxf(float x, float y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_fmax( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_fmax(double x, double y) { return fmax(x, y); } @@ -1300,6 +1762,11 @@ extern "C" __device__ double test_fmax(double x, double y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_fminf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_fminf(float x, float y) { return fminf(x, y); } @@ -1319,6 +1786,11 @@ extern "C" __device__ float test_fminf(float x, float y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_fmin( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_fmin(double x, double y) { return fmin(x, y); } @@ -1338,6 +1810,11 @@ extern "C" __device__ double test_fmin(double x, double y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail 
call contract noundef float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_fmodf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_fmod_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_fmodf(float x, float y) { return fmodf(x, y); } @@ -1357,6 +1834,11 @@ extern "C" __device__ float test_fmodf(float x, float y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_fmod( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_fmod_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_fmod(double x, double y) { return fmod(x, y); } @@ -1369,6 +1851,14 @@ extern "C" __device__ double test_fmod(double x, double y) { // CHECK-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 // CHECK-NEXT: ret float [[TMP2]] // +// AMDGCNSPIRV-LABEL: @test_frexpf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) { float, i32 } @llvm.frexp.f32.i32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[Y:%.*]], align 4, !tbaa [[TBAA13:![0-9]+]] +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 +// AMDGCNSPIRV-NEXT: ret float [[TMP2]] +// extern "C" __device__ float test_frexpf(float x, int* y) { return frexpf(x, y); } @@ -1381,6 +1871,14 @@ extern "C" __device__ float test_frexpf(float x, int* y) { // 
CHECK-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 // CHECK-NEXT: ret double [[TMP2]] // +// AMDGCNSPIRV-LABEL: @test_frexp( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) { double, i32 } @llvm.frexp.f64.i32(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[Y:%.*]], align 4, !tbaa [[TBAA13]] +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 +// AMDGCNSPIRV-NEXT: ret double [[TMP2]] +// extern "C" __device__ double test_frexp(double x, int* y) { return frexp(x, y); } @@ -1400,6 +1898,11 @@ extern "C" __device__ double test_frexp(double x, int* y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_hypotf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_hypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_hypotf(float x, float y) { return hypotf(x, y); } @@ -1419,6 +1922,11 @@ extern "C" __device__ float test_hypotf(float x, float y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_hypot( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_hypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_hypot(double x, double y) { return hypot(x, y); } @@ -1438,6 +1946,11 @@ extern "C" __device__ 
double test_hypot(double x, double y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret i32 [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_ilogbf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call spir_func noundef addrspace(4) i32 @__ocml_ilogb_f32(float noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret i32 [[CALL_I]] +// extern "C" __device__ int test_ilogbf(float x) { return ilogbf(x); } @@ -1457,6 +1970,11 @@ extern "C" __device__ int test_ilogbf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call noundef i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret i32 [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_ilogb( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call spir_func noundef addrspace(4) i32 @__ocml_ilogb_f64(double noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret i32 [[CALL_I]] +// extern "C" __device__ int test_ilogb(double x) { return ilogb(x); } @@ -1479,6 +1997,13 @@ extern "C" __device__ int test_ilogb(double x) { // APPROX-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // APPROX-NEXT: ret i32 [[CONV]] // +// AMDGCNSPIRV-LABEL: @test___finitef( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) float @llvm.fabs.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = fcmp one float [[TMP0]], 0x7FF0000000000000 +// AMDGCNSPIRV-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 +// AMDGCNSPIRV-NEXT: ret i32 [[CONV]] +// extern "C" __device__ BOOL_TYPE test___finitef(float x) { return __finitef(x); } @@ -1501,6 +2026,13 @@ extern "C" __device__ BOOL_TYPE test___finitef(float x) { // APPROX-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // APPROX-NEXT: ret i32 [[CONV]] // +// AMDGCNSPIRV-LABEL: @test___finite( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) double @llvm.fabs.f64(double [[X:%.*]]) +// 
AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = fcmp one double [[TMP0]], 0x7FF0000000000000 +// AMDGCNSPIRV-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 +// AMDGCNSPIRV-NEXT: ret i32 [[CONV]] +// extern "C" __device__ BOOL_TYPE test___finite(double x) { return __finite(x); } @@ -1523,6 +2055,13 @@ extern "C" __device__ BOOL_TYPE test___finite(double x) { // APPROX-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // APPROX-NEXT: ret i32 [[CONV]] // +// AMDGCNSPIRV-LABEL: @test___isinff( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) float @llvm.fabs.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = fcmp oeq float [[TMP0]], 0x7FF0000000000000 +// AMDGCNSPIRV-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 +// AMDGCNSPIRV-NEXT: ret i32 [[CONV]] +// extern "C" __device__ BOOL_TYPE test___isinff(float x) { return __isinff(x); } @@ -1545,6 +2084,13 @@ extern "C" __device__ BOOL_TYPE test___isinff(float x) { // APPROX-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 // APPROX-NEXT: ret i32 [[CONV]] // +// AMDGCNSPIRV-LABEL: @test___isinf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) double @llvm.fabs.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = fcmp oeq double [[TMP0]], 0x7FF0000000000000 +// AMDGCNSPIRV-NEXT: [[CONV:%.*]] = zext i1 [[TMP1]] to i32 +// AMDGCNSPIRV-NEXT: ret i32 [[CONV]] +// extern "C" __device__ BOOL_TYPE test___isinf(double x) { return __isinf(x); } @@ -1565,6 +2111,12 @@ extern "C" __device__ BOOL_TYPE test___isinf(double x) { // APPROX-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 // APPROX-NEXT: ret i32 [[CONV]] // +// AMDGCNSPIRV-LABEL: @test___isnanf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = fcmp uno float [[X:%.*]], 0.000000e+00 +// AMDGCNSPIRV-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 +// AMDGCNSPIRV-NEXT: ret i32 [[CONV]] +// extern "C" __device__ BOOL_TYPE test___isnanf(float x) { return __isnanf(x); } @@ -1585,6 +2137,12 @@ extern "C" 
__device__ BOOL_TYPE test___isnanf(float x) { // APPROX-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 // APPROX-NEXT: ret i32 [[CONV]] // +// AMDGCNSPIRV-LABEL: @test___isnan( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = fcmp uno double [[X:%.*]], 0.000000e+00 +// AMDGCNSPIRV-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 +// AMDGCNSPIRV-NEXT: ret i32 [[CONV]] +// extern "C" __device__ BOOL_TYPE test___isnan(double x) { return __isnan(x); } @@ -1604,6 +2162,11 @@ extern "C" __device__ BOOL_TYPE test___isnan(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_j0f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j0_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_j0f(float x) { return j0f(x); } @@ -1623,6 +2186,11 @@ extern "C" __device__ float test_j0f(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_j0( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j0_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_j0(double x) { return j0(x); } @@ -1642,6 +2210,11 @@ extern "C" __device__ double test_j0(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_j1_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_j1f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j1_f32(float noundef 
[[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_j1f(float x) { return j1f(x); } @@ -1661,6 +2234,11 @@ extern "C" __device__ float test_j1f(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_j1( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j1_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_j1(double x) { return j1(x); } @@ -1764,6 +2342,39 @@ extern "C" __device__ double test_j1(double x) { // APPROX-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I_I]], [[IF_THEN_I]] ], [ [[CALL_I20_I]], [[IF_THEN2_I]] ], [ [[CALL_I22_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] // APPROX-NEXT: ret float [[RETVAL_0_I]] // +// AMDGCNSPIRV-LABEL: @test_jnf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ +// AMDGCNSPIRV-NEXT: i32 0, label [[IF_THEN_I:%.*]] +// AMDGCNSPIRV-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// AMDGCNSPIRV-NEXT: ] +// AMDGCNSPIRV: if.then.i: +// AMDGCNSPIRV-NEXT: [[CALL_I_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j0_f32(float noundef [[Y:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: br label [[_ZL3JNFIF_EXIT:%.*]] +// AMDGCNSPIRV: if.then2.i: +// AMDGCNSPIRV-NEXT: [[CALL_I20_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: br label [[_ZL3JNFIF_EXIT]] +// AMDGCNSPIRV: if.end4.i: +// AMDGCNSPIRV-NEXT: [[CALL_I21_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_j0_f32(float noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[CALL_I22_I:%.*]] = tail call contract spir_func noundef addrspace(4) float 
@__ocml_j1_f32(float noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1 +// AMDGCNSPIRV-NEXT: br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL3JNFIF_EXIT]] +// AMDGCNSPIRV: for.body.i: +// AMDGCNSPIRV-NEXT: [[__I_026_I:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X1_025_I:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I22_I]], [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X0_024_I:%.*]] = phi float [ [[__X1_025_I]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_026_I]], 1 +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] +// AMDGCNSPIRV-NEXT: [[MUL8_I:%.*]] = fmul contract float [[__X1_025_I]], [[DIV_I]] +// AMDGCNSPIRV-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_024_I]] +// AMDGCNSPIRV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_026_I]], 1 +// AMDGCNSPIRV-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[INC_I]], [[X]] +// AMDGCNSPIRV-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZL3JNFIF_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP15:![0-9]+]] +// AMDGCNSPIRV: _ZL3jnfif.exit: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I_I]], [[IF_THEN_I]] ], [ [[CALL_I20_I]], [[IF_THEN2_I]] ], [ [[CALL_I22_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// AMDGCNSPIRV-NEXT: ret float [[RETVAL_0_I]] +// extern "C" __device__ float test_jnf(int x, float y) { return jnf(x, y); } @@ -1867,6 +2478,39 @@ extern "C" __device__ float test_jnf(int x, float y) { // APPROX-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I_I]], [[IF_THEN_I]] ], [ [[CALL_I20_I]], [[IF_THEN2_I]] ], [ [[CALL_I22_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] // APPROX-NEXT: ret double [[RETVAL_0_I]] // +// AMDGCNSPIRV-LABEL: @test_jn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: switch i32 [[X:%.*]], 
label [[IF_END4_I:%.*]] [ +// AMDGCNSPIRV-NEXT: i32 0, label [[IF_THEN_I:%.*]] +// AMDGCNSPIRV-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// AMDGCNSPIRV-NEXT: ] +// AMDGCNSPIRV: if.then.i: +// AMDGCNSPIRV-NEXT: [[CALL_I_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j0_f64(double noundef [[Y:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: br label [[_ZL2JNID_EXIT:%.*]] +// AMDGCNSPIRV: if.then2.i: +// AMDGCNSPIRV-NEXT: [[CALL_I20_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: br label [[_ZL2JNID_EXIT]] +// AMDGCNSPIRV: if.end4.i: +// AMDGCNSPIRV-NEXT: [[CALL_I21_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j0_f64(double noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[CALL_I22_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_j1_f64(double noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1 +// AMDGCNSPIRV-NEXT: br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL2JNID_EXIT]] +// AMDGCNSPIRV: for.body.i: +// AMDGCNSPIRV-NEXT: [[__I_026_I:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X1_025_I:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I22_I]], [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X0_024_I:%.*]] = phi double [ [[__X1_025_I]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_026_I]], 1 +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] +// AMDGCNSPIRV-NEXT: [[MUL8_I:%.*]] = fmul contract double [[__X1_025_I]], [[DIV_I]] +// AMDGCNSPIRV-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_024_I]] +// AMDGCNSPIRV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_026_I]], 1 +// AMDGCNSPIRV-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 
[[INC_I]], [[X]] +// AMDGCNSPIRV-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZL2JNID_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP16:![0-9]+]] +// AMDGCNSPIRV: _ZL2jnid.exit: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I_I]], [[IF_THEN_I]] ], [ [[CALL_I20_I]], [[IF_THEN2_I]] ], [ [[CALL_I22_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// AMDGCNSPIRV-NEXT: ret double [[RETVAL_0_I]] +// extern "C" __device__ double test_jn(int x, double y) { return jn(x, y); } @@ -1886,6 +2530,11 @@ extern "C" __device__ double test_jn(int x, double y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_ldexpf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_ldexpf(float x, int y) { return ldexpf(x, y); } @@ -1905,6 +2554,11 @@ extern "C" __device__ float test_ldexpf(float x, int y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_ldexp( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_ldexp(double x, int y) { return ldexp(x, y); } @@ -1924,6 +2578,11 @@ extern "C" __device__ double test_ldexp(double x, int y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_lgammaf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func 
noundef addrspace(4) float @__ocml_lgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_lgammaf(float x) { return lgammaf(x); } @@ -1943,6 +2602,11 @@ extern "C" __device__ float test_lgammaf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_lgamma( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_lgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_lgamma(double x) { return lgamma(x); } @@ -1965,6 +2629,12 @@ extern "C" __device__ double test_lgamma(double x) { // APPROX-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // +// AMDGCNSPIRV-LABEL: @test_llrintf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.rint.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] +// extern "C" __device__ long long int test_llrintf(float x) { return llrintf(x); } @@ -1987,6 +2657,12 @@ extern "C" __device__ long long int test_llrintf(float x) { // APPROX-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // +// AMDGCNSPIRV-LABEL: @test_llrint( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.rint.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] +// extern "C" __device__ long long int test_llrint(double x) { return llrint(x); } @@ -2009,6 +2685,12 @@ extern "C" __device__ long long int test_llrint(double x) { // APPROX-NEXT: [[CONV_I:%.*]] = 
fptosi float [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // +// AMDGCNSPIRV-LABEL: @test_llroundf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.round.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] +// extern "C" __device__ long long int test_llroundf(float x) { return llroundf(x); } @@ -2031,6 +2713,12 @@ extern "C" __device__ long long int test_llroundf(float x) { // APPROX-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // +// AMDGCNSPIRV-LABEL: @test_llround( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.round.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] +// extern "C" __device__ long long int test_llround(double x) { return llround(x); } @@ -2050,6 +2738,11 @@ extern "C" __device__ long long int test_llround(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log10.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_log10f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log10.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_log10f(float x) { return log10f(x); } @@ -2069,6 +2762,11 @@ extern "C" __device__ float test_log10f(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_log10( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_log10_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret 
double [[CALL_I]] +// extern "C" __device__ double test_log10(double x) { return log10(x); } @@ -2088,6 +2786,11 @@ extern "C" __device__ double test_log10(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_log1pf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_log1p_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_log1pf(float x) { return log1pf(x); } @@ -2107,6 +2810,11 @@ extern "C" __device__ float test_log1pf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_log1p( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_log1p_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_log1p(double x) { return log1p(x); } @@ -2126,6 +2834,11 @@ extern "C" __device__ double test_log1p(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_log2f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_log2_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_log2f(float x) { return log2f(x); } @@ -2145,6 +2858,11 @@ extern "C" __device__ float test_log2f(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret 
double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_log2( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_log2_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_log2(double x) { return log2(x); } @@ -2164,6 +2882,11 @@ extern "C" __device__ double test_log2(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_logbf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_logb_f32(float noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_logbf(float x) { return logbf(x); } @@ -2183,6 +2906,11 @@ extern "C" __device__ float test_logbf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_logb( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_logb_f64(double noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_logb(double x) { return logb(x); } @@ -2202,6 +2930,11 @@ extern "C" __device__ double test_logb(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_logf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_log_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_logf(float x) { 
return logf(x); } @@ -2224,6 +2957,12 @@ extern "C" __device__ float test_logf(float x) { // APPROX-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // +// AMDGCNSPIRV-LABEL: @test_lrintf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.rint.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] +// extern "C" __device__ long int test_lrintf(float x) { return lrintf(x); } @@ -2246,6 +2985,12 @@ extern "C" __device__ long int test_lrintf(float x) { // APPROX-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // +// AMDGCNSPIRV-LABEL: @test_lrint( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.rint.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] +// extern "C" __device__ long int test_lrint(double x) { return lrint(x); } @@ -2268,6 +3013,12 @@ extern "C" __device__ long int test_lrint(double x) { // APPROX-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // +// AMDGCNSPIRV-LABEL: @test_lroundf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.round.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] +// extern "C" __device__ long int test_lroundf(float x) { return lroundf(x); } @@ -2290,6 +3041,12 @@ extern "C" __device__ long int test_lroundf(float x) { // APPROX-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // APPROX-NEXT: ret i64 [[CONV_I]] // +// AMDGCNSPIRV-LABEL: @test_lround( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.round.f64(double [[X:%.*]]) +// 
AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: ret i64 [[CONV_I]] +// extern "C" __device__ long int test_lround(double x) { return lround(x); } @@ -2324,6 +3081,17 @@ extern "C" __device__ long int test_lround(double x) { // APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_modff( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[__TMP_I]]) #[[ATTR15:[0-9]+]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) float @__ocml_modf_f32(float noundef [[X:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[TBAA17:![0-9]+]] +// AMDGCNSPIRV-NEXT: store float [[TMP0]], ptr addrspace(4) [[Y:%.*]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_modff(float x, float* y) { return modff(x, y); } @@ -2358,6 +3126,17 @@ extern "C" __device__ float test_modff(float x, float* y) { // APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_modf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func 
noundef addrspace(4) double @__ocml_modf_f64(double noundef [[X:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__TMP_ASCAST_I]], align 8, !tbaa [[TBAA19:![0-9]+]] +// AMDGCNSPIRV-NEXT: store double [[TMP0]], ptr addrspace(4) [[Y:%.*]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_modf(double x, double* y) { return modf(x, y); } @@ -2556,6 +3335,93 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX-NEXT: [[TMP10:%.*]] = bitcast i32 [[BF_SET9_I]] to float // APPROX-NEXT: ret float [[TMP10]] // +// AMDGCNSPIRV-LABEL: @test_nanf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[TAG:%.*]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I27_I_I:%.*]] +// AMDGCNSPIRV: if.then.i.i: +// AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TAG]], i64 1 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I14_I_I:%.*]] [ +// AMDGCNSPIRV-NEXT: i8 120, label [[WHILE_COND_I_I_I_PREHEADER:%.*]] +// AMDGCNSPIRV-NEXT: i8 88, label [[WHILE_COND_I_I_I_PREHEADER]] +// AMDGCNSPIRV-NEXT: ] +// AMDGCNSPIRV: while.cond.i.i.i.preheader: +// AMDGCNSPIRV-NEXT: br label [[WHILE_COND_I_I_I:%.*]] +// AMDGCNSPIRV: while.cond.i.i.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I_I_I_PREHEADER]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I_I_I]] 
], [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ] +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL4NANFPKC_EXIT:%.*]], label [[WHILE_BODY_I_I_I:%.*]] +// AMDGCNSPIRV: while.body.i.i.i: +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 +// AMDGCNSPIRV-NEXT: [[OR_COND_I_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] +// AMDGCNSPIRV: if.else.i.i.i: +// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 +// AMDGCNSPIRV-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] +// AMDGCNSPIRV: if.else17.i.i.i: +// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 +// AMDGCNSPIRV-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I_I_I]] +// AMDGCNSPIRV: if.end31.i.i.i: +// AMDGCNSPIRV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// AMDGCNSPIRV-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 4 +// AMDGCNSPIRV-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 +// AMDGCNSPIRV-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] +// AMDGCNSPIRV-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] +// AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], i64 1 +// AMDGCNSPIRV-NEXT: br label [[CLEANUP_I_I_I]] +// AMDGCNSPIRV: cleanup.i.i.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[IF_ELSE17_I_I_I]] ] +// 
AMDGCNSPIRV-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I_I_I]], [[IF_ELSE17_I_I_I]] ] +// AMDGCNSPIRV-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP12]] +// AMDGCNSPIRV: while.cond.i14.i.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I24_I_I:%.*]], [[WHILE_BODY_I18_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I25_I_I:%.*]], [[WHILE_BODY_I18_I_I]] ], [ 0, [[IF_THEN_I_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I18_I_I]] +// AMDGCNSPIRV: while.body.i18.i.i: +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 +// AMDGCNSPIRV-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 +// AMDGCNSPIRV-NEXT: [[MUL_I20_I_I:%.*]] = shl i64 [[__R_0_I16_I_I]], 3 +// AMDGCNSPIRV-NEXT: [[CONV5_I21_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 +// AMDGCNSPIRV-NEXT: [[ADD_I22_I_I:%.*]] = add i64 [[MUL_I20_I_I]], -48 +// AMDGCNSPIRV-NEXT: [[SUB_I23_I_I:%.*]] = add i64 [[ADD_I22_I_I]], [[CONV5_I21_I_I]] +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I_I_I:%.*]] = zext i1 [[OR_COND_I19_I_I]] to i64 +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I24_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I_I]], i64 [[__TAGP_ADDR_1_IDX_I_I_I]] +// AMDGCNSPIRV-NEXT: [[__R_1_I25_I_I]] = select i1 [[OR_COND_I19_I_I]], i64 [[SUB_I23_I_I]], i64 [[__R_0_I16_I_I]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP8]] +// AMDGCNSPIRV: while.cond.i27.i.i: 
+// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I28_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I38_I_I:%.*]], [[WHILE_BODY_I31_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I29_I_I:%.*]] = phi i64 [ [[__R_1_I39_I_I:%.*]], [[WHILE_BODY_I31_I_I]] ], [ 0, [[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I28_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I30_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I30_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I31_I_I]] +// AMDGCNSPIRV: while.body.i31.i.i: +// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 +// AMDGCNSPIRV-NEXT: [[OR_COND_I32_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 +// AMDGCNSPIRV-NEXT: [[MUL_I33_I_I:%.*]] = mul i64 [[__R_0_I29_I_I]], 10 +// AMDGCNSPIRV-NEXT: [[CONV5_I34_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 +// AMDGCNSPIRV-NEXT: [[ADD_I35_I_I:%.*]] = add i64 [[MUL_I33_I_I]], -48 +// AMDGCNSPIRV-NEXT: [[SUB_I36_I_I:%.*]] = add i64 [[ADD_I35_I_I]], [[CONV5_I34_I_I]] +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I37_I_I:%.*]] = zext i1 [[OR_COND_I32_I_I]] to i64 +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I38_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I28_I_I]], i64 [[__TAGP_ADDR_1_IDX_I37_I_I]] +// AMDGCNSPIRV-NEXT: [[__R_1_I39_I_I]] = select i1 [[OR_COND_I32_I_I]], i64 [[SUB_I36_I_I]], i64 [[__R_0_I29_I_I]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I32_I_I]], label [[WHILE_COND_I27_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP11]] +// AMDGCNSPIRV: _ZL4nanfPKc.exit: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_I18_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ], [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[WHILE_BODY_I31_I_I]] ], [ [[__R_0_I29_I_I]], [[WHILE_COND_I27_I_I]] ] +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 +// AMDGCNSPIRV-NEXT: [[BF_VALUE_I:%.*]] = and i32 
[[CONV_I]], 4194303 +// AMDGCNSPIRV-NEXT: [[BF_SET9_I:%.*]] = or disjoint i32 [[BF_VALUE_I]], 2143289344 +// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = bitcast i32 [[BF_SET9_I]] to float +// AMDGCNSPIRV-NEXT: ret float [[TMP10]] +// extern "C" __device__ float test_nanf(const char *tag) { return nanf(tag); } @@ -2752,6 +3618,92 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX-NEXT: [[TMP10:%.*]] = bitcast i64 [[BF_SET9_I]] to double // APPROX-NEXT: ret double [[TMP10]] // +// AMDGCNSPIRV-LABEL: @test_nan( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[TAG:%.*]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I27_I_I:%.*]] +// AMDGCNSPIRV: if.then.i.i: +// AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TAG]], i64 1 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I14_I_I:%.*]] [ +// AMDGCNSPIRV-NEXT: i8 120, label [[WHILE_COND_I_I_I_PREHEADER:%.*]] +// AMDGCNSPIRV-NEXT: i8 88, label [[WHILE_COND_I_I_I_PREHEADER]] +// AMDGCNSPIRV-NEXT: ] +// AMDGCNSPIRV: while.cond.i.i.i.preheader: +// AMDGCNSPIRV-NEXT: br label [[WHILE_COND_I_I_I:%.*]] +// AMDGCNSPIRV: while.cond.i.i.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I_I_I_PREHEADER]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[WHILE_COND_I_I_I_PREHEADER]] ] +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I_I]], label 
[[_ZL3NANPKC_EXIT:%.*]], label [[WHILE_BODY_I_I_I:%.*]] +// AMDGCNSPIRV: while.body.i.i.i: +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 +// AMDGCNSPIRV-NEXT: [[OR_COND_I_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] +// AMDGCNSPIRV: if.else.i.i.i: +// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 +// AMDGCNSPIRV-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] +// AMDGCNSPIRV: if.else17.i.i.i: +// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 +// AMDGCNSPIRV-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I_I_I]] +// AMDGCNSPIRV: if.end31.i.i.i: +// AMDGCNSPIRV-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// AMDGCNSPIRV-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 4 +// AMDGCNSPIRV-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 +// AMDGCNSPIRV-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] +// AMDGCNSPIRV-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] +// AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], i64 1 +// AMDGCNSPIRV-NEXT: br label [[CLEANUP_I_I_I]] +// AMDGCNSPIRV: cleanup.i.i.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[IF_ELSE17_I_I_I]] ] +// AMDGCNSPIRV-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I_I_I]], [[IF_ELSE17_I_I_I]] ] +// AMDGCNSPIRV-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] +// AMDGCNSPIRV-NEXT: br i1 [[COND_I_I_I]], label 
[[WHILE_COND_I_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP12]] +// AMDGCNSPIRV: while.cond.i14.i.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I24_I_I:%.*]], [[WHILE_BODY_I18_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I25_I_I:%.*]], [[WHILE_BODY_I18_I_I]] ], [ 0, [[IF_THEN_I_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I18_I_I]] +// AMDGCNSPIRV: while.body.i18.i.i: +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 +// AMDGCNSPIRV-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 +// AMDGCNSPIRV-NEXT: [[MUL_I20_I_I:%.*]] = shl i64 [[__R_0_I16_I_I]], 3 +// AMDGCNSPIRV-NEXT: [[CONV5_I21_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 +// AMDGCNSPIRV-NEXT: [[ADD_I22_I_I:%.*]] = add i64 [[MUL_I20_I_I]], -48 +// AMDGCNSPIRV-NEXT: [[SUB_I23_I_I:%.*]] = add i64 [[ADD_I22_I_I]], [[CONV5_I21_I_I]] +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I_I_I:%.*]] = zext i1 [[OR_COND_I19_I_I]] to i64 +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I24_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I15_I_I]], i64 [[__TAGP_ADDR_1_IDX_I_I_I]] +// AMDGCNSPIRV-NEXT: [[__R_1_I25_I_I]] = select i1 [[OR_COND_I19_I_I]], i64 [[SUB_I23_I_I]], i64 [[__R_0_I16_I_I]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP8]] +// AMDGCNSPIRV: while.cond.i27.i.i: +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I28_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I38_I_I:%.*]], [[WHILE_BODY_I31_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] +// AMDGCNSPIRV-NEXT: [[__R_0_I29_I_I:%.*]] = phi i64 [ [[__R_1_I39_I_I:%.*]], [[WHILE_BODY_I31_I_I]] ], [ 0, [[ENTRY]] ] +// 
AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I28_I_I]], align 1, !tbaa [[TBAA5]] +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I30_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I30_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I31_I_I]] +// AMDGCNSPIRV: while.body.i31.i.i: +// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 +// AMDGCNSPIRV-NEXT: [[OR_COND_I32_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 +// AMDGCNSPIRV-NEXT: [[MUL_I33_I_I:%.*]] = mul i64 [[__R_0_I29_I_I]], 10 +// AMDGCNSPIRV-NEXT: [[CONV5_I34_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 +// AMDGCNSPIRV-NEXT: [[ADD_I35_I_I:%.*]] = add i64 [[MUL_I33_I_I]], -48 +// AMDGCNSPIRV-NEXT: [[SUB_I36_I_I:%.*]] = add i64 [[ADD_I35_I_I]], [[CONV5_I34_I_I]] +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I37_I_I:%.*]] = zext i1 [[OR_COND_I32_I_I]] to i64 +// AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I38_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I28_I_I]], i64 [[__TAGP_ADDR_1_IDX_I37_I_I]] +// AMDGCNSPIRV-NEXT: [[__R_1_I39_I_I]] = select i1 [[OR_COND_I32_I_I]], i64 [[SUB_I36_I_I]], i64 [[__R_0_I29_I_I]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I32_I_I]], label [[WHILE_COND_I27_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP11]] +// AMDGCNSPIRV: _ZL3nanPKc.exit: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_I18_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ], [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[WHILE_BODY_I31_I_I]] ], [ [[__R_0_I29_I_I]], [[WHILE_COND_I27_I_I]] ] +// AMDGCNSPIRV-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 +// AMDGCNSPIRV-NEXT: [[BF_SET9_I:%.*]] = or disjoint i64 [[BF_VALUE_I]], 9221120237041090560 +// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = bitcast i64 [[BF_SET9_I]] to double +// AMDGCNSPIRV-NEXT: ret double [[TMP10]] +// extern "C" __device__ double test_nan(const char *tag) { return nan(tag); } @@ -2768,6 +3720,10 @@ extern "C" __device__ double 
test_nan(const char *tag) { // APPROX-NEXT: entry: // APPROX-NEXT: ret float 0x7FF8000000000000 // +// AMDGCNSPIRV-LABEL: @test_nanf_emptystr( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: ret float 0x7FF8000000000000 +// extern "C" __device__ float test_nanf_emptystr() { return nanf(""); } @@ -2784,6 +3740,10 @@ extern "C" __device__ float test_nanf_emptystr() { // APPROX-NEXT: entry: // APPROX-NEXT: ret double 0x7FF8000000000000 // +// AMDGCNSPIRV-LABEL: @test_nan_emptystr( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: ret double 0x7FF8000000000000 +// extern "C" __device__ double test_nan_emptystr() { return nan(""); } @@ -2800,6 +3760,10 @@ extern "C" __device__ double test_nan_emptystr() { // APPROX-NEXT: entry: // APPROX-NEXT: ret float 0x7FF8000000000000 // +// AMDGCNSPIRV-LABEL: @test_nanf_fill( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: ret float 0x7FF8000000000000 +// extern "C" __device__ float test_nanf_fill() { return nanf("0x456"); } @@ -2816,6 +3780,10 @@ extern "C" __device__ float test_nanf_fill() { // APPROX-NEXT: entry: // APPROX-NEXT: ret double 0x7FF8000000000000 // +// AMDGCNSPIRV-LABEL: @test_nan_fill( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: ret double 0x7FF8000000000000 +// extern "C" __device__ double test_nan_fill() { return nan("0x123"); } @@ -2835,6 +3803,11 @@ extern "C" __device__ double test_nan_fill() { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.nearbyint.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_nearbyintf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.nearbyint.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_nearbyintf(float x) { return nearbyintf(x); } @@ -2854,6 +3827,11 @@ extern "C" __device__ float test_nearbyintf(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double 
@llvm.nearbyint.f64(double [[X:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_nearbyint( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.nearbyint.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_nearbyint(double x) { return nearbyint(x); } @@ -2873,6 +3851,11 @@ extern "C" __device__ double test_nearbyint(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_nextafterf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_nextafter_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_nextafterf(float x, float y) { return nextafterf(x, y); } @@ -2892,6 +3875,11 @@ extern "C" __device__ float test_nextafterf(float x, float y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_nextafter( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_nextafter_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_nextafter(double x, double y) { return nextafter(x, y); } @@ -2911,6 +3899,11 @@ extern "C" __device__ double test_nextafter(double x, double y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret 
float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_norm3df( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_len3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_norm3df(float x, float y, float z) { return norm3df(x, y, z); } @@ -2930,6 +3923,11 @@ extern "C" __device__ float test_norm3df(float x, float y, float z) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_norm3d( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_len3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_norm3d(double x, double y, double z) { return norm3d(x, y, z); } @@ -2949,6 +3947,11 @@ extern "C" __device__ double test_norm3d(double x, double y, double z) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_norm4df( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_len4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_norm4df(float x, float y, float z, float w) { return norm4df(x, y, z, w); } @@ -2968,6 +3971,11 @@ extern "C" __device__ float 
test_norm4df(float x, float y, float z, float w) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_norm4d( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_len4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_norm4d(double x, double y, double z, double w) { return norm4d(x, y, z, w); } @@ -2987,6 +3995,11 @@ extern "C" __device__ double test_norm4d(double x, double y, double z, double w) // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_normcdff( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_ncdf_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_normcdff(float x) { return normcdff(x); } @@ -3006,6 +4019,11 @@ extern "C" __device__ float test_normcdff(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_normcdf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_ncdf_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_normcdf(double x) { return normcdf(x); } @@ -3025,6 +4043,11 @@ extern "C" __device__ double 
test_normcdf(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_normcdfinvf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_ncdfinv_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_normcdfinvf(float x) { return normcdfinvf(x); } @@ -3044,6 +4067,11 @@ extern "C" __device__ float test_normcdfinvf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_normcdfinv( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_ncdfinv_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_normcdfinv(double x) { return normcdfinv(x); } @@ -3108,6 +4136,26 @@ extern "C" __device__ double test_normcdfinv(double x) { // APPROX-NEXT: [[TMP1:%.*]] = tail call contract noundef float @llvm.sqrt.f32(float [[__R_0_LCSSA_I]]) // APPROX-NEXT: ret float [[TMP1]] // +// AMDGCNSPIRV-LABEL: @test_normf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT5_I:%.*]] = icmp eq i32 [[X:%.*]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT5_I]], label [[_ZL5NORMFIPKF_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] +// AMDGCNSPIRV: while.body.i: +// AMDGCNSPIRV-NEXT: [[__R_08_I:%.*]] = phi float [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +// AMDGCNSPIRV-NEXT: [[__A_ADDR_07_I:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__DIM_ADDR_06_I:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], 
[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_06_I]], -1 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__A_ADDR_07_I]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] +// AMDGCNSPIRV-NEXT: [[ADD_I]] = fadd contract float [[__R_08_I]], [[MUL_I]] +// AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__A_ADDR_07_I]], i64 4 +// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] +// AMDGCNSPIRV: _ZL5normfiPKf.exit: +// AMDGCNSPIRV-NEXT: [[__R_0_LCSSA_I:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = tail call contract noundef addrspace(4) float @llvm.sqrt.f32(float [[__R_0_LCSSA_I]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP1]] +// extern "C" __device__ float test_normf(int x, const float *y) { return normf(x, y); } @@ -3172,6 +4220,26 @@ extern "C" __device__ float test_normf(int x, const float *y) { // APPROX-NEXT: [[TMP1:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[__R_0_LCSSA_I]]) // APPROX-NEXT: ret double [[TMP1]] // +// AMDGCNSPIRV-LABEL: @test_norm( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT5_I:%.*]] = icmp eq i32 [[X:%.*]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT5_I]], label [[_ZL4NORMIPKD_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] +// AMDGCNSPIRV: while.body.i: +// AMDGCNSPIRV-NEXT: [[__R_08_I:%.*]] = phi double [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +// AMDGCNSPIRV-NEXT: [[__A_ADDR_07_I:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__DIM_ADDR_06_I:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[DEC_I]] = add nsw i32 
[[__DIM_ADDR_06_I]], -1 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__A_ADDR_07_I]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] +// AMDGCNSPIRV-NEXT: [[ADD_I]] = fadd contract double [[__R_08_I]], [[MUL_I]] +// AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__A_ADDR_07_I]], i64 8 +// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] +// AMDGCNSPIRV: _ZL4normiPKd.exit: +// AMDGCNSPIRV-NEXT: [[__R_0_LCSSA_I:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = tail call contract noundef addrspace(4) double @llvm.sqrt.f64(double [[__R_0_LCSSA_I]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP1]] +// extern "C" __device__ double test_norm(int x, const double *y) { return norm(x, y); } @@ -3191,6 +4259,11 @@ extern "C" __device__ double test_norm(int x, const double *y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_powf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_powf(float x, float y) { return powf(x, y); } @@ -3210,6 +4283,11 @@ extern "C" __device__ float test_powf(float x, float y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_pow( +// AMDGCNSPIRV-NEXT: entry: +// 
AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_pow_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_pow(double x, double y) { return pow(x, y); } @@ -3229,6 +4307,11 @@ extern "C" __device__ double test_pow(double x, double y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_powif( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_pown_f32(float noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_powif(float x, int y) { return powif(x, y); } @@ -3248,6 +4331,11 @@ extern "C" __device__ float test_powif(float x, int y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_powi( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_pown_f64(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_powi(double x, int y) { return powi(x, y); } @@ -3267,6 +4355,11 @@ extern "C" __device__ double test_powi(double x, int y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rcbrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_rcbrtf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rcbrt_f32(float noundef 
[[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_rcbrtf(float x) { return rcbrtf(x); } @@ -3286,6 +4379,11 @@ extern "C" __device__ float test_rcbrtf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_rcbrt( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rcbrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_rcbrt(double x) { return rcbrt(x); } @@ -3305,6 +4403,11 @@ extern "C" __device__ double test_rcbrt(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_remainderf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_remainder_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_remainderf(float x, float y) { return remainderf(x, y); } @@ -3324,6 +4427,11 @@ extern "C" __device__ float test_remainderf(float x, float y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_remainder( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_remainder_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_remainder(double 
x, double y) { return remainder(x, y); } @@ -3358,6 +4466,17 @@ extern "C" __device__ double test_remainder(double x, double y) { // APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_remquof( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) float @__ocml_remquo_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[TBAA13]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(4) [[Z:%.*]], align 4, !tbaa [[TBAA13]] +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_remquof(float x, float y, int* z) { return remquof(x, y, z); } @@ -3392,6 +4511,17 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) { // APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_remquo( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) double @__ocml_remquo_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], ptr 
noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[TBAA13]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(4) [[Z:%.*]], align 4, !tbaa [[TBAA13]] +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_remquo(double x, double y, int* z) { return remquo(x, y, z); } @@ -3411,6 +4541,11 @@ extern "C" __device__ double test_remquo(double x, double y, int* z) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_rhypotf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rhypot_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_rhypotf(float x, float y) { return rhypotf(x, y); } @@ -3430,6 +4565,11 @@ extern "C" __device__ float test_rhypotf(float x, float y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_rhypot( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rhypot_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_rhypot(double x, double y) { return rhypot(x, y); } @@ -3449,6 +4589,11 @@ extern "C" __device__ double test_rhypot(double x, double y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.rint.f32(float [[X:%.*]]) // 
APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_rintf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.rint.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_rintf(float x) { return rintf(x); } @@ -3468,6 +4613,11 @@ extern "C" __device__ float test_rintf(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.rint.f64(double [[X:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_rint( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.rint.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_rint(double x) { return rint(x); } @@ -3532,6 +4682,26 @@ extern "C" __device__ double test_rint(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[__R_0_LCSSA_I]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_rnormf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT5_I:%.*]] = icmp eq i32 [[X:%.*]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT5_I]], label [[_ZL6RNORMFIPKF_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] +// AMDGCNSPIRV: while.body.i: +// AMDGCNSPIRV-NEXT: [[__R_08_I:%.*]] = phi float [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +// AMDGCNSPIRV-NEXT: [[__A_ADDR_07_I:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__DIM_ADDR_06_I:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_06_I]], -1 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__A_ADDR_07_I]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] +// 
AMDGCNSPIRV-NEXT: [[ADD_I]] = fadd contract float [[__R_08_I]], [[MUL_I]] +// AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__A_ADDR_07_I]], i64 4 +// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] +// AMDGCNSPIRV: _ZL6rnormfiPKf.exit: +// AMDGCNSPIRV-NEXT: [[__R_0_LCSSA_I:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rsqrt_f32(float noundef [[__R_0_LCSSA_I]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_rnormf(int x, const float* y) { return rnormf(x, y); } @@ -3596,6 +4766,26 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[__R_0_LCSSA_I]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_rnorm( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT5_I:%.*]] = icmp eq i32 [[X:%.*]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT5_I]], label [[_ZL5RNORMIPKD_EXIT:%.*]], label [[WHILE_BODY_I:%.*]] +// AMDGCNSPIRV: while.body.i: +// AMDGCNSPIRV-NEXT: [[__R_08_I:%.*]] = phi double [ [[ADD_I:%.*]], [[WHILE_BODY_I]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +// AMDGCNSPIRV-NEXT: [[__A_ADDR_07_I:%.*]] = phi ptr addrspace(4) [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ [[Y:%.*]], [[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[__DIM_ADDR_06_I:%.*]] = phi i32 [ [[DEC_I:%.*]], [[WHILE_BODY_I]] ], [ [[X]], [[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[DEC_I]] = add nsw i32 [[__DIM_ADDR_06_I]], -1 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__A_ADDR_07_I]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], 
[[TMP0]] +// AMDGCNSPIRV-NEXT: [[ADD_I]] = fadd contract double [[__R_08_I]], [[MUL_I]] +// AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__A_ADDR_07_I]], i64 8 +// AMDGCNSPIRV-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP24:![0-9]+]] +// AMDGCNSPIRV: _ZL5rnormiPKd.exit: +// AMDGCNSPIRV-NEXT: [[__R_0_LCSSA_I:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_BODY_I]] ] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rsqrt_f64(double noundef [[__R_0_LCSSA_I]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_rnorm(int x, const double* y) { return rnorm(x, y); } @@ -3615,6 +4805,11 @@ extern "C" __device__ double test_rnorm(int x, const double* y) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_rnorm3df( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rlen3_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_rnorm3df(float x, float y, float z) { return rnorm3df(x, y, z); } @@ -3634,6 +4829,11 @@ extern "C" __device__ float test_rnorm3df(float x, float y, float z) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_rnorm3d( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call 
contract spir_func noundef addrspace(4) double @__ocml_rlen3_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_rnorm3d(double x, double y, double z) { return rnorm3d(x, y, z); } @@ -3653,6 +4853,11 @@ extern "C" __device__ double test_rnorm3d(double x, double y, double z) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_rnorm4df( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rlen4_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]], float noundef [[Z:%.*]], float noundef [[W:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_rnorm4df(float x, float y, float z, float w) { return rnorm4df(x, y, z, w); } @@ -3672,6 +4877,11 @@ extern "C" __device__ float test_rnorm4df(float x, float y, float z, float w) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_rnorm4d( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rlen4_f64(double noundef [[X:%.*]], double noundef [[Y:%.*]], double noundef [[Z:%.*]], double noundef [[W:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_rnorm4d(double x, double y, double z, double w) { return rnorm4d(x, y, z, w); } @@ -3691,6 +4901,11 @@ extern "C" __device__ double test_rnorm4d(double x, double y, double z, double w // 
APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.round.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_roundf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.round.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_roundf(float x) { return roundf(x); } @@ -3710,6 +4925,11 @@ extern "C" __device__ float test_roundf(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.round.f64(double [[X:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_round( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.round.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_round(double x) { return round(x); } @@ -3729,6 +4949,11 @@ extern "C" __device__ double test_round(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_rsqrtf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_rsqrt_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_rsqrtf(float x) { return rsqrtf(x); } @@ -3748,6 +4973,11 @@ extern "C" __device__ float test_rsqrtf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_rsqrt( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_rsqrt_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double 
[[CALL_I]] +// extern "C" __device__ double test_rsqrt(double x) { return rsqrt(x); } @@ -3797,6 +5027,21 @@ extern "C" __device__ double test_rsqrt(double x) { // APPROX-NEXT: [[COND_I:%.*]] = phi contract float [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] // APPROX-NEXT: ret float [[COND_I]] // +// AMDGCNSPIRV-LABEL: @test_scalblnf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i64 [[Y:%.*]], 9223372036854775807 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label [[COND_FALSE_I:%.*]], label [[COND_TRUE_I:%.*]] +// AMDGCNSPIRV: cond.true.i: +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = trunc i64 [[Y]] to i32 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) +// AMDGCNSPIRV-NEXT: br label [[_ZL8SCALBLNFFL_EXIT:%.*]] +// AMDGCNSPIRV: cond.false.i: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func addrspace(4) float @__ocml_scalb_f32(float noundef [[X]], float noundef 0x43E0000000000000) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: br label [[_ZL8SCALBLNFFL_EXIT]] +// AMDGCNSPIRV: _ZL8scalblnffl.exit: +// AMDGCNSPIRV-NEXT: [[COND_I:%.*]] = phi contract float [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] +// AMDGCNSPIRV-NEXT: ret float [[COND_I]] +// extern "C" __device__ float test_scalblnf(float x, long int y) { return scalblnf(x, y); } @@ -3846,6 +5091,21 @@ extern "C" __device__ float test_scalblnf(float x, long int y) { // APPROX-NEXT: [[COND_I:%.*]] = phi contract double [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] // APPROX-NEXT: ret double [[COND_I]] // +// AMDGCNSPIRV-LABEL: @test_scalbln( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i64 [[Y:%.*]], 9223372036854775807 +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label [[COND_FALSE_I:%.*]], label [[COND_TRUE_I:%.*]] +// AMDGCNSPIRV: cond.true.i: +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = trunc i64 [[Y]] to i32 +// 
AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) +// AMDGCNSPIRV-NEXT: br label [[_ZL7SCALBLNDL_EXIT:%.*]] +// AMDGCNSPIRV: cond.false.i: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func addrspace(4) double @__ocml_scalb_f64(double noundef [[X]], double noundef 0x43E0000000000000) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: br label [[_ZL7SCALBLNDL_EXIT]] +// AMDGCNSPIRV: _ZL7scalblndl.exit: +// AMDGCNSPIRV-NEXT: [[COND_I:%.*]] = phi contract double [ [[TMP0]], [[COND_TRUE_I]] ], [ [[CALL_I]], [[COND_FALSE_I]] ] +// AMDGCNSPIRV-NEXT: ret double [[COND_I]] +// extern "C" __device__ double test_scalbln(double x, long int y) { return scalbln(x, y); } @@ -3865,6 +5125,11 @@ extern "C" __device__ double test_scalbln(double x, long int y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_scalbnf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_scalbnf(float x, int y) { return scalbnf(x, y); } @@ -3884,6 +5149,11 @@ extern "C" __device__ float test_scalbnf(float x, int y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_scalbn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_scalbn(double x, int y) { return scalbn(x, y); } @@ -3894,6 +5164,12 @@ extern "C" __device__ double test_scalbn(double x, int y) { // CHECK-NEXT: 
[[DOTLOBIT:%.*]] = lshr i32 [[TMP0]], 31 // CHECK-NEXT: ret i32 [[DOTLOBIT]] // +// AMDGCNSPIRV-LABEL: @test___signbitf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = bitcast float [[X:%.*]] to i32 +// AMDGCNSPIRV-NEXT: [[DOTLOBIT:%.*]] = lshr i32 [[TMP0]], 31 +// AMDGCNSPIRV-NEXT: ret i32 [[DOTLOBIT]] +// extern "C" __device__ BOOL_TYPE test___signbitf(float x) { return __signbitf(x); } @@ -3905,6 +5181,13 @@ extern "C" __device__ BOOL_TYPE test___signbitf(float x) { // CHECK-NEXT: [[CONV:%.*]] = trunc nuw nsw i64 [[DOTLOBIT]] to i32 // CHECK-NEXT: ret i32 [[CONV]] // +// AMDGCNSPIRV-LABEL: @test___signbit( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = bitcast double [[X:%.*]] to i64 +// AMDGCNSPIRV-NEXT: [[DOTLOBIT:%.*]] = lshr i64 [[TMP0]], 63 +// AMDGCNSPIRV-NEXT: [[CONV:%.*]] = trunc nuw nsw i64 [[DOTLOBIT]] to i32 +// AMDGCNSPIRV-NEXT: ret i32 [[CONV]] +// extern "C" __device__ BOOL_TYPE test___signbit(double x) { return __signbit(x); } @@ -3942,6 +5225,18 @@ extern "C" __device__ BOOL_TYPE test___signbit(double x) { // APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret void // +// AMDGCNSPIRV-LABEL: @test_sincosf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func addrspace(4) float @__ocml_sincos_f32(float noundef [[X:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: store float [[CALL_I]], ptr addrspace(4) [[Y:%.*]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: store float [[TMP0]], ptr addrspace(4) [[Z:%.*]], 
align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: ret void +// extern "C" __device__ void test_sincosf(float x, float *y, float *z) { sincosf(x, y, z); } @@ -3979,6 +5274,18 @@ extern "C" __device__ void test_sincosf(float x, float *y, float *z) { // APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret void // +// AMDGCNSPIRV-LABEL: @test_sincos( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func addrspace(4) double @__ocml_sincos_f64(double noundef [[X:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: store double [[CALL_I]], ptr addrspace(4) [[Y:%.*]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__TMP_ASCAST_I]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: store double [[TMP0]], ptr addrspace(4) [[Z:%.*]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: ret void +// extern "C" __device__ void test_sincos(double x, double *y, double *z) { sincos(x, y, z); } @@ -4016,6 +5323,18 @@ extern "C" __device__ void test_sincos(double x, double *y, double *z) { // APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret void // +// AMDGCNSPIRV-LABEL: @test_sincospif( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) +// 
AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func addrspace(4) float @__ocml_sincospi_f32(float noundef [[X:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: store float [[CALL_I]], ptr addrspace(4) [[Y:%.*]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: store float [[TMP0]], ptr addrspace(4) [[Z:%.*]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: ret void +// extern "C" __device__ void test_sincospif(float x, float *y, float *z) { sincospif(x, y, z); } @@ -4053,6 +5372,18 @@ extern "C" __device__ void test_sincospif(float x, float *y, float *z) { // APPROX-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret void // +// AMDGCNSPIRV-LABEL: @test_sincospi( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[__TMP_I:%.*]] = alloca double, align 8 +// AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func addrspace(4) double @__ocml_sincospi_f64(double noundef [[X:%.*]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: store double [[CALL_I]], ptr addrspace(4) [[Y:%.*]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load double, ptr addrspace(4) [[__TMP_ASCAST_I]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: store double [[TMP0]], ptr addrspace(4) [[Z:%.*]], align 8, !tbaa [[TBAA19]] +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[__TMP_I]]) #[[ATTR15]] +// 
AMDGCNSPIRV-NEXT: ret void +// extern "C" __device__ void test_sincospi(double x, double *y, double *z) { sincospi(x, y, z); } @@ -4072,6 +5403,11 @@ extern "C" __device__ void test_sincospi(double x, double *y, double *z) { // APPROX-NEXT: [[CALL_I_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I_I]] // +// AMDGCNSPIRV-LABEL: @test_sinf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_sinf(float x) { return sinf(x); } @@ -4091,6 +5427,11 @@ extern "C" __device__ float test_sinf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_sin( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_sin_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_sin(double x) { return sin(x); } @@ -4110,6 +5451,11 @@ extern "C" __device__ double test_sin(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_sinpif( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_sinpi_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_sinpif(float x) { return sinpif(x); } @@ -4129,6 +5475,11 @@ extern "C" __device__ float test_sinpif(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double 
@__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_sinpi( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_sinpi_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_sinpi(double x) { return sinpi(x); } @@ -4148,6 +5499,11 @@ extern "C" __device__ double test_sinpi(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.sqrt.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_sqrtf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.sqrt.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_sqrtf(float x) { return sqrtf(x); } @@ -4167,6 +5523,11 @@ extern "C" __device__ float test_sqrtf(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_sqrt( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.sqrt.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_sqrt(double x) { return sqrt(x); } @@ -4186,6 +5547,11 @@ extern "C" __device__ double test_sqrt(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_tanf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_tan_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_tanf(float x) { return tanf(x); 
} @@ -4205,6 +5571,11 @@ extern "C" __device__ float test_tanf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_tan( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_tan_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_tan(double x) { return tan(x); } @@ -4224,6 +5595,11 @@ extern "C" __device__ double test_tan(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_tanhf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_tanh_f32(float noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_tanhf(float x) { return tanhf(x); } @@ -4243,6 +5619,11 @@ extern "C" __device__ float test_tanhf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_tanh( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_tanh_f64(double noundef [[X:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_tanh(double x) { return tanh(x); } @@ -4262,6 +5643,11 @@ extern "C" __device__ double test_tanh(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_tgammaf( +// AMDGCNSPIRV-NEXT: entry: 
+// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_tgamma_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_tgammaf(float x) { return tgammaf(x); } @@ -4281,6 +5667,11 @@ extern "C" __device__ float test_tgammaf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_tgamma( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_tgamma_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_tgamma(double x) { return tgamma(x); } @@ -4300,6 +5691,11 @@ extern "C" __device__ double test_tgamma(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.trunc.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_truncf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.trunc.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_truncf(float x) { return truncf(x); } @@ -4319,6 +5715,11 @@ extern "C" __device__ float test_truncf(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.trunc.f64(double [[X:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_trunc( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.trunc.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_trunc(double x) { return trunc(x); } @@ -4338,6 +5739,11 @@ extern "C" __device__ double test_trunc(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract 
noundef float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_y0f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y0_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_y0f(float x) { return y0f(x); } @@ -4357,6 +5763,11 @@ extern "C" __device__ float test_y0f(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_y0( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y0_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_y0(double x) { return y0(x); } @@ -4376,6 +5787,11 @@ extern "C" __device__ double test_y0(double x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_y1f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y1_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test_y1f(float x) { return y1f(x); } @@ -4395,6 +5811,11 @@ extern "C" __device__ float test_y1f(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret double [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test_y1( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y1_f64(double noundef [[X:%.*]]) #[[ATTR14]] +// 
AMDGCNSPIRV-NEXT: ret double [[CALL_I]] +// extern "C" __device__ double test_y1(double x) { return y1(x); } @@ -4498,6 +5919,39 @@ extern "C" __device__ double test_y1(double x) { // APPROX-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I_I]], [[IF_THEN_I]] ], [ [[CALL_I20_I]], [[IF_THEN2_I]] ], [ [[CALL_I22_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] // APPROX-NEXT: ret float [[RETVAL_0_I]] // +// AMDGCNSPIRV-LABEL: @test_ynf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ +// AMDGCNSPIRV-NEXT: i32 0, label [[IF_THEN_I:%.*]] +// AMDGCNSPIRV-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// AMDGCNSPIRV-NEXT: ] +// AMDGCNSPIRV: if.then.i: +// AMDGCNSPIRV-NEXT: [[CALL_I_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y0_f32(float noundef [[Y:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: br label [[_ZL3YNFIF_EXIT:%.*]] +// AMDGCNSPIRV: if.then2.i: +// AMDGCNSPIRV-NEXT: [[CALL_I20_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: br label [[_ZL3YNFIF_EXIT]] +// AMDGCNSPIRV: if.end4.i: +// AMDGCNSPIRV-NEXT: [[CALL_I21_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y0_f32(float noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[CALL_I22_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_y1_f32(float noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1 +// AMDGCNSPIRV-NEXT: br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL3YNFIF_EXIT]] +// AMDGCNSPIRV: for.body.i: +// AMDGCNSPIRV-NEXT: [[__I_026_I:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X1_025_I:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I22_I]], [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X0_024_I:%.*]] = phi float [ [[__X1_025_I]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] +// 
AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_026_I]], 1 +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] +// AMDGCNSPIRV-NEXT: [[MUL8_I:%.*]] = fmul contract float [[__X1_025_I]], [[DIV_I]] +// AMDGCNSPIRV-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_024_I]] +// AMDGCNSPIRV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_026_I]], 1 +// AMDGCNSPIRV-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[INC_I]], [[X]] +// AMDGCNSPIRV-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZL3YNFIF_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP25:![0-9]+]] +// AMDGCNSPIRV: _ZL3ynfif.exit: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi float [ [[CALL_I_I]], [[IF_THEN_I]] ], [ [[CALL_I20_I]], [[IF_THEN2_I]] ], [ [[CALL_I22_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// AMDGCNSPIRV-NEXT: ret float [[RETVAL_0_I]] +// extern "C" __device__ float test_ynf(int x, float y) { return ynf(x, y); } @@ -4601,6 +6055,39 @@ extern "C" __device__ float test_ynf(int x, float y) { // APPROX-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I_I]], [[IF_THEN_I]] ], [ [[CALL_I20_I]], [[IF_THEN2_I]] ], [ [[CALL_I22_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] // APPROX-NEXT: ret double [[RETVAL_0_I]] // +// AMDGCNSPIRV-LABEL: @test_yn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: switch i32 [[X:%.*]], label [[IF_END4_I:%.*]] [ +// AMDGCNSPIRV-NEXT: i32 0, label [[IF_THEN_I:%.*]] +// AMDGCNSPIRV-NEXT: i32 1, label [[IF_THEN2_I:%.*]] +// AMDGCNSPIRV-NEXT: ] +// AMDGCNSPIRV: if.then.i: +// AMDGCNSPIRV-NEXT: [[CALL_I_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y0_f64(double noundef [[Y:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: br label [[_ZL2YNID_EXIT:%.*]] +// AMDGCNSPIRV: if.then2.i: +// AMDGCNSPIRV-NEXT: [[CALL_I20_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: 
br label [[_ZL2YNID_EXIT]] +// AMDGCNSPIRV: if.end4.i: +// AMDGCNSPIRV-NEXT: [[CALL_I21_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y0_f64(double noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[CALL_I22_I:%.*]] = tail call contract spir_func noundef addrspace(4) double @__ocml_y1_f64(double noundef [[Y]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[CMP723_I:%.*]] = icmp sgt i32 [[X]], 1 +// AMDGCNSPIRV-NEXT: br i1 [[CMP723_I]], label [[FOR_BODY_I:%.*]], label [[_ZL2YNID_EXIT]] +// AMDGCNSPIRV: for.body.i: +// AMDGCNSPIRV-NEXT: [[__I_026_I:%.*]] = phi i32 [ [[INC_I:%.*]], [[FOR_BODY_I]] ], [ 1, [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X1_025_I:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I22_I]], [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[__X0_024_I:%.*]] = phi double [ [[__X1_025_I]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_026_I]], 1 +// AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] +// AMDGCNSPIRV-NEXT: [[MUL8_I:%.*]] = fmul contract double [[__X1_025_I]], [[DIV_I]] +// AMDGCNSPIRV-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_024_I]] +// AMDGCNSPIRV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_026_I]], 1 +// AMDGCNSPIRV-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[INC_I]], [[X]] +// AMDGCNSPIRV-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZL2YNID_EXIT]], label [[FOR_BODY_I]], !llvm.loop [[LOOP26:![0-9]+]] +// AMDGCNSPIRV: _ZL2ynid.exit: +// AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi double [ [[CALL_I_I]], [[IF_THEN_I]] ], [ [[CALL_I20_I]], [[IF_THEN2_I]] ], [ [[CALL_I22_I]], [[IF_END4_I]] ], [ [[SUB_I]], [[FOR_BODY_I]] ] +// AMDGCNSPIRV-NEXT: ret double [[RETVAL_0_I]] +// extern "C" __device__ double test_yn(int x, double y) { return yn(x, y); } @@ -4620,6 +6107,11 @@ extern "C" __device__ double test_yn(int x, double y) { // APPROX-NEXT: 
[[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test___cosf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_cos_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test___cosf(float x) { return __cosf(x); } @@ -4642,6 +6134,12 @@ extern "C" __device__ float test___cosf(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test___exp10f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x400A934F00000000 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test___exp10f(float x) { return __exp10f(x); } @@ -4664,6 +6162,12 @@ extern "C" __device__ float test___exp10f(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test___expf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], 0x3FF7154760000000 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.amdgcn.exp2.f32(float [[MUL_I]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test___expf(float x) { return __expf(x); } @@ -4683,6 +6187,11 @@ extern "C" __device__ float test___expf(float x) { // APPROX-NEXT: [[ADD_I:%.*]] = fadd contract float [[X:%.*]], [[Y:%.*]] // APPROX-NEXT: ret float [[ADD_I]] // +// AMDGCNSPIRV-LABEL: @test___fadd_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: 
[[ADD_I:%.*]] = fadd contract float [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-NEXT: ret float [[ADD_I]] +// extern "C" __device__ float test___fadd_rn(float x, float y) { return __fadd_rn(x, y); } @@ -4702,6 +6211,11 @@ extern "C" __device__ float test___fadd_rn(float x, float y) { // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X:%.*]], [[Y:%.*]] // APPROX-NEXT: ret float [[DIV_I]] // +// AMDGCNSPIRV-LABEL: @test___fdividef( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-NEXT: ret float [[DIV_I]] +// extern "C" __device__ float test___fdividef(float x, float y) { return __fdividef(x, y); } @@ -4721,6 +6235,11 @@ extern "C" __device__ float test___fdividef(float x, float y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test__fmaf_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test__fmaf_rn(float x, float y, float z) { return __fmaf_rn(x, y, z); } @@ -4740,6 +6259,11 @@ extern "C" __device__ float test__fmaf_rn(float x, float y, float z) { // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], [[Y:%.*]] // APPROX-NEXT: ret float [[MUL_I]] // +// AMDGCNSPIRV-LABEL: @test___fmul_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-NEXT: ret float [[MUL_I]] +// extern "C" __device__ float test___fmul_rn(float x, float y) { return __fmul_rn(x, y); } @@ -4759,6 +6283,11 @@ extern "C" __device__ float test___fmul_rn(float x, float y) { // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float 1.000000e+00, [[X:%.*]] // APPROX-NEXT: ret float [[DIV_I]] // +// 
AMDGCNSPIRV-LABEL: @test___frcp_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract float 1.000000e+00, [[X:%.*]] +// AMDGCNSPIRV-NEXT: ret float [[DIV_I]] +// extern "C" __device__ float test___frcp_rn(float x) { return __frcp_rn(x); } @@ -4778,6 +6307,11 @@ extern "C" __device__ float test___frcp_rn(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.rsq.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test___frsqrt_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.amdgcn.rsq.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test___frsqrt_rn(float x) { return __frsqrt_rn(x); } @@ -4797,6 +6331,11 @@ extern "C" __device__ float test___frsqrt_rn(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR12]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test___fsqrt_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_sqrt_f32(float noundef [[X:%.*]]) #[[ATTR12]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test___fsqrt_rn(float x) { return __fsqrt_rn(x); } @@ -4816,6 +6355,11 @@ extern "C" __device__ float test___fsqrt_rn(float x) { // APPROX-NEXT: [[SUB_I:%.*]] = fsub contract float [[X:%.*]], [[Y:%.*]] // APPROX-NEXT: ret float [[SUB_I]] // +// AMDGCNSPIRV-LABEL: @test___fsub_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[SUB_I:%.*]] = fsub contract float [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-NEXT: ret float [[SUB_I]] +// extern "C" __device__ float test___fsub_rn(float x, float y) { return __fsub_rn(x, y); } @@ -4835,6 +6379,11 @@ extern "C" __device__ float test___fsub_rn(float x, float y) { // APPROX-NEXT: [[TMP0:%.*]] = tail 
call contract noundef float @llvm.log10.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test___log10f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log10.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test___log10f(float x) { return __log10f(x); } @@ -4854,6 +6403,11 @@ extern "C" __device__ float test___log10f(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.amdgcn.log.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test___log2f( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.amdgcn.log.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test___log2f(float x) { return __log2f(x); } @@ -4873,6 +6427,11 @@ extern "C" __device__ float test___log2f(float x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.log.f32(float [[X:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test___logf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.log.f32(float [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test___logf(float x) { return __logf(x); } @@ -4892,6 +6451,11 @@ extern "C" __device__ float test___logf(float x) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]] // APPROX-NEXT: ret float [[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test___powf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_pow_f32(float noundef [[X:%.*]], float noundef [[Y:%.*]]) #[[ATTR13]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ 
float test___powf(float x, float y) { return __powf(x, y); } @@ -4920,6 +6484,14 @@ extern "C" __device__ float test___powf(float x, float y) { // APPROX-NEXT: [[COND5_I:%.*]] = select contract i1 [[CMP_I]], float 0.000000e+00, float [[COND_I]] // APPROX-NEXT: ret float [[COND5_I]] // +// AMDGCNSPIRV-LABEL: @test___saturatef( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CMP_I:%.*]] = fcmp contract olt float [[X:%.*]], 0.000000e+00 +// AMDGCNSPIRV-NEXT: [[CMP1_I:%.*]] = fcmp contract ogt float [[X]], 1.000000e+00 +// AMDGCNSPIRV-NEXT: [[COND_I:%.*]] = select contract i1 [[CMP1_I]], float 1.000000e+00, float [[X]] +// AMDGCNSPIRV-NEXT: [[COND5_I:%.*]] = select contract i1 [[CMP_I]], float 0.000000e+00, float [[COND_I]] +// AMDGCNSPIRV-NEXT: ret float [[COND5_I]] +// extern "C" __device__ float test___saturatef(float x) { return __saturatef(x); } @@ -4948,6 +6520,14 @@ extern "C" __device__ float test___saturatef(float x) { // APPROX-NEXT: store float [[CALL1_I]], ptr [[Z:%.*]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: ret void // +// AMDGCNSPIRV-LABEL: @test___sincosf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func addrspace(4) float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: store float [[CALL_I]], ptr addrspace(4) [[Y:%.*]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: [[CALL1_I:%.*]] = tail call contract spir_func addrspace(4) float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: store float [[CALL1_I]], ptr addrspace(4) [[Z:%.*]], align 4, !tbaa [[TBAA17]] +// AMDGCNSPIRV-NEXT: ret void +// extern "C" __device__ void test___sincosf(float x, float *y, float *z) { __sincosf(x, y, z); } @@ -4967,6 +6547,11 @@ extern "C" __device__ void test___sincosf(float x, float *y, float *z) { // APPROX-NEXT: [[CALL_I:%.*]] = tail call contract noundef float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] // APPROX-NEXT: ret float 
[[CALL_I]] // +// AMDGCNSPIRV-LABEL: @test___sinf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: ret float [[CALL_I]] +// extern "C" __device__ float test___sinf(float x) { return __sinf(x); } @@ -4995,6 +6580,14 @@ extern "C" __device__ float test___sinf(float x) { // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[CALL_I_I]], [[TMP0]] // APPROX-NEXT: ret float [[MUL_I]] // +// AMDGCNSPIRV-LABEL: @test___tanf( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[CALL_I_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_sin_f32(float noundef [[X:%.*]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[CALL_I3_I:%.*]] = tail call contract spir_func noundef addrspace(4) float @__ocml_native_cos_f32(float noundef [[X]]) #[[ATTR14]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract addrspace(4) float @llvm.amdgcn.rcp.f32(float [[CALL_I3_I]]) +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract float [[CALL_I_I]], [[TMP0]] +// AMDGCNSPIRV-NEXT: ret float [[MUL_I]] +// extern "C" __device__ float test___tanf(float x) { return __tanf(x); } @@ -5014,6 +6607,11 @@ extern "C" __device__ float test___tanf(float x) { // APPROX-NEXT: [[ADD_I:%.*]] = fadd contract double [[X:%.*]], [[Y:%.*]] // APPROX-NEXT: ret double [[ADD_I]] // +// AMDGCNSPIRV-LABEL: @test___dadd_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[ADD_I:%.*]] = fadd contract double [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-NEXT: ret double [[ADD_I]] +// extern "C" __device__ double test___dadd_rn(double x, double y) { return __dadd_rn(x, y); } @@ -5033,6 +6631,11 @@ extern "C" __device__ double test___dadd_rn(double x, double y) { // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract double [[X:%.*]], [[Y:%.*]] // APPROX-NEXT: ret double [[DIV_I]] // +// AMDGCNSPIRV-LABEL: @test___ddiv_rn( +// AMDGCNSPIRV-NEXT: entry: +// 
AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract double [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-NEXT: ret double [[DIV_I]] +// extern "C" __device__ double test___ddiv_rn(double x, double y) { return __ddiv_rn(x, y); } @@ -5052,6 +6655,11 @@ extern "C" __device__ double test___ddiv_rn(double x, double y) { // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[X:%.*]], [[Y:%.*]] // APPROX-NEXT: ret double [[MUL_I]] // +// AMDGCNSPIRV-LABEL: @test___dmul_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[MUL_I:%.*]] = fmul contract double [[X:%.*]], [[Y:%.*]] +// AMDGCNSPIRV-NEXT: ret double [[MUL_I]] +// extern "C" __device__ double test___dmul_rn(double x, double y) { return __dmul_rn(x, y); } @@ -5071,6 +6679,11 @@ extern "C" __device__ double test___dmul_rn(double x, double y) { // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract double 1.000000e+00, [[X:%.*]] // APPROX-NEXT: ret double [[DIV_I]] // +// AMDGCNSPIRV-LABEL: @test___drcp_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[DIV_I:%.*]] = fdiv contract double 1.000000e+00, [[X:%.*]] +// AMDGCNSPIRV-NEXT: ret double [[DIV_I]] +// extern "C" __device__ double test___drcp_rn(double x) { return __drcp_rn(x); } @@ -5090,6 +6703,11 @@ extern "C" __device__ double test___drcp_rn(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.sqrt.f64(double [[X:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test___dsqrt_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.sqrt.f64(double [[X:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test___dsqrt_rn(double x) { return __dsqrt_rn(x); } @@ -5109,6 +6727,11 @@ extern "C" __device__ double test___dsqrt_rn(double x) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: 
@test__fma_rn( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test__fma_rn(double x, double y, double z) { return __fma_rn(x, y, z); } @@ -5128,6 +6751,11 @@ extern "C" __device__ double test__fma_rn(double x, double y, double z) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_float_min( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_float_min(float x, float y) { return min(x, y); } @@ -5147,6 +6775,11 @@ extern "C" __device__ float test_float_min(float x, float y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) // APPROX-NEXT: ret float [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_float_max( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret float [[TMP0]] +// extern "C" __device__ float test_float_max(float x, float y) { return max(x, y); } @@ -5166,6 +6799,11 @@ extern "C" __device__ float test_float_max(float x, float y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_double_min( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern 
"C" __device__ double test_double_min(double x, double y) { return min(x, y); } @@ -5185,6 +6823,11 @@ extern "C" __device__ double test_double_min(double x, double y) { // APPROX-NEXT: [[TMP0:%.*]] = tail call contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) // APPROX-NEXT: ret double [[TMP0]] // +// AMDGCNSPIRV-LABEL: @test_double_max( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call contract noundef addrspace(4) double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret double [[TMP0]] +// extern "C" __device__ double test_double_max(double x, double y) { return max(x, y); } @@ -5193,6 +6836,11 @@ extern "C" __device__ double test_double_max(double x, double y) { // CHECK-NEXT: [[COND_I:%.*]] = tail call noundef i32 @llvm.smin.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) // CHECK-NEXT: ret i32 [[COND_I]] // +// AMDGCNSPIRV-LABEL: @test_int_min( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[COND_I:%.*]] = tail call noundef addrspace(4) i32 @llvm.smin.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret i32 [[COND_I]] +// extern "C" __device__ int test_int_min(int x, int y) { return min(x, y); } @@ -5202,6 +6850,11 @@ extern "C" __device__ int test_int_min(int x, int y) { // CHECK-NEXT: [[COND_I:%.*]] = tail call noundef i32 @llvm.smax.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) // CHECK-NEXT: ret i32 [[COND_I]] // +// AMDGCNSPIRV-LABEL: @test_int_max( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[COND_I:%.*]] = tail call noundef addrspace(4) i32 @llvm.smax.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) +// AMDGCNSPIRV-NEXT: ret i32 [[COND_I]] +// extern "C" __device__ int test_int_max(int x, int y) { return max(x, y); } diff --git a/clang/test/OpenMP/atomic_messages.cpp b/clang/test/OpenMP/atomic_messages.cpp index d492f6ee1e896..c4e240a0ebb4e 100644 --- a/clang/test/OpenMP/atomic_messages.cpp +++ b/clang/test/OpenMP/atomic_messages.cpp @@ -991,3 +991,34 @@ int mixed() { // expected-note@+1 {{in 
instantiation of function template specialization 'mixed' requested here}} return mixed(); } + +#ifdef OMP51 +struct U {}; +struct U operator<(U, U); +struct U operator>(U, U); +struct U operator==(U, U); + +template void templated() { + T cx, cv, ce, cd; +#pragma omp atomic compare capture + if (cx == ce) { + cx = cd; + } else { + cv = cx; + } +#pragma omp atomic compare capture + { + cv = cx; + if (ce > cx) { + cx = ce; + } + } +#pragma omp atomic compare capture + { + cv = cx; + if (cx < ce) { + cx = ce; + } + } +} +#endif diff --git a/clang/test/OpenMP/xteam_red_debug_info.c b/clang/test/OpenMP/xteam_red_debug_info.c new file mode 100644 index 0000000000000..68223177d57c4 --- /dev/null +++ b/clang/test/OpenMP/xteam_red_debug_info.c @@ -0,0 +1,16 @@ +// RUN: %clang -g %s -fopenmp --offload-arch=gfx90a -S --offload-host-only -emit-llvm -o - | FileCheck %s + +void test_xteam_red_debug_info() { + int N = 100000; + double c[N]; + double sum = 0.0; + #pragma omp target teams distribute parallel for reduction(+: sum) + for (int i=0; i +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} Function {{.*}} '__builtin_amdgcn_is_invocable' '__amdgpu_feature_predicate_t () noexcept' diff --git a/clang/test/SemaHIP/amdgpu-is-invocable.hip b/clang/test/SemaHIP/amdgpu-is-invocable.hip new file mode 100644 index 0000000000000..214d7769a595f --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-is-invocable.hip @@ -0,0 +1,21 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s + +// expected-no-diagnostics + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + 
+__device__ void foo() { + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16)) + return __builtin_trap(); +} + +__global__ void bar() { + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16)) + return __builtin_trap(); +} diff --git a/clang/test/SemaHIP/amdgpu-processor-is.hip b/clang/test/SemaHIP/amdgpu-processor-is.hip new file mode 100644 index 0000000000000..0f7211fd75d90 --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-processor-is.hip @@ -0,0 +1,21 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s + +// expected-no-diagnostics + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ void foo() { + if (__builtin_amdgcn_processor_is("gfx900")) + return __builtin_trap(); +} + +__global__ void bar() { + if (__builtin_amdgcn_processor_is("gfx900")) + return __builtin_trap(); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl new file mode 100644 index 0000000000000..30ffbfc130a94 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -triple amdgcn-- -verify -S -o - %s + +void test_builtin_amdgcn_cvt_off_f32_i4(int n) { + struct A{ unsigned x; } a; + __builtin_amdgcn_cvt_off_f32_i4(n, n); // expected-error {{too many arguments to function call, expected 1, have 2}} + __builtin_amdgcn_cvt_off_f32_i4(); // expected-error {{too few arguments to function call, expected 1, have 0}} + __builtin_amdgcn_cvt_off_f32_i4(a); // expected-error {{passing '__private 
struct A' to parameter of incompatible type 'int'}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-lds-err.cl similarity index 60% rename from clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl rename to clang/test/SemaOpenCL/builtins-amdgcn-global-load-lds-err.cl index 7cf80f7c92677..d5185a069b5c4 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-lds-err.cl @@ -1,4 +1,6 @@ -// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -verify=gfx940,expected -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -verify=gfx,expected -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -S -verify=gfx,expected -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -verify=gfx,expected -o - %s // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s // REQUIRES: amdgpu-registered-target @@ -8,12 +10,12 @@ void test_global_load_lds_unsupported_size(global u32* src, local u32 *dst, u32 __builtin_amdgcn_global_load_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, offset, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, aux); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} - 
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx940-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx940-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid 
size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} } __attribute__((target("gfx950-insts"))) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl new file mode 100644 index 0000000000000..5915393ae7f56 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -S -verify=gfx90a,expected -o - %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s +// REQUIRES: amdgpu-registered-target + +void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int offset, int soffset, int x) { + __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, x, offset, soffset, 0, 0); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, x, 0); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 3, offset, soffset, 0, 0); //expected-error{{invalid size value}} gfx950-note{{size must be 1, 2, 4, 12 or 16}} gfx90a-note{{size must be 1, 2, or 4}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl new file mode 100644 index 0000000000000..768f894e9180d --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu 
gfx1100 -S -verify -o - %s +// REQUIRES: amdgpu-registered-target + +void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int offset, int soffset, int x) { + __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, 0); //expected-error{{needs target feature vmem-to-lds-load-insts}} +} diff --git a/clang/test/SemaTemplate/cwg2398.cpp b/clang/test/SemaTemplate/cwg2398.cpp index dccb17c48d325..0be5054d9132b 100644 --- a/clang/test/SemaTemplate/cwg2398.cpp +++ b/clang/test/SemaTemplate/cwg2398.cpp @@ -638,26 +638,19 @@ namespace nttp_auto { template struct A; } // namespace t1 namespace t2 { - // FIXME: Shouldn't accept parameters after a parameter pack. template class> struct A {}; - // new-error@-1 {{deduced non-type template argument does not have the same type as the corresponding template parameter ('auto' vs 'int')}} - // expected-note@-2 {{previous template template parameter is here}} + // expected-error@-1 {{template parameter pack must be the last template parameter}} + // old-note@-2 {{previous template template parameter is here}} template struct B; - // new-note@-1 {{template parameter is declared here}} - // old-note@-2 {{too few template parameters}} + // old-note@-1 {{too few template parameters}} template struct A; - // new-note@-1 {{different template parameters}} - // old-error@-2 {{different template parameters}} + // old-error@-1 {{different template parameters}} } // namespace t2 namespace t3 { - // FIXME: Shouldn't accept parameters after a parameter pack. 
template class> struct A {}; - // new-error@-1 {{deduced non-type template argument does not have the same type as the corresponding template parameter ('auto' vs 'int')}} - // new-note@-2 {{previous template template parameter is here}} + // expected-error@-1 {{template parameter pack must be the last template parameter}} template struct B; - // new-note@-1 {{template parameter is declared here}} template struct A; - // new-note@-1 {{different template parameters}} } // namespace t3 } // namespace nttp_auto diff --git a/clang/test/SemaTemplate/temp_arg_template_p0522.cpp b/clang/test/SemaTemplate/temp_arg_template_p0522.cpp index 2e5a36ae6ed08..d8a81bb363112 100644 --- a/clang/test/SemaTemplate/temp_arg_template_p0522.cpp +++ b/clang/test/SemaTemplate/temp_arg_template_p0522.cpp @@ -7,7 +7,8 @@ template typename> struct Ti; // #Ti template typename> struct TPi; // #TPi template typename> struct TiPi; -template typename> struct TPiPi; // FIXME: Why is this not ill-formed? +template typename> struct TPiPi; +// expected-error@-1 {{template parameter pack must be the last template parameter}} template typename> struct tT0; // #tT0 template typename> struct Tt0; // #Tt0 diff --git a/clang/tools/amdllvm/CMakeLists.txt b/clang/tools/amdllvm/CMakeLists.txt index 7b14f2ba33ec4..346f7b70d3a1a 100644 --- a/clang/tools/amdllvm/CMakeLists.txt +++ b/clang/tools/amdllvm/CMakeLists.txt @@ -11,7 +11,7 @@ option(CLANG_LINK_FLANG "Create flang install link to clang" ON) list(APPEND CLANG_LINKS_TO_CREATE clang clang++ clang-cl clang-cpp clang-${CLANG_VERSION_MAJOR} lld) if(CLANG_LINK_FLANG) - list(APPEND CLANG_LINKS_TO_CREATE flang flang-new flang-legacy) + list(APPEND CLANG_LINKS_TO_CREATE flang flang-new) endif() foreach(link ${CLANG_LINKS_TO_CREATE}) diff --git a/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp b/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp index 14c584064e311..49412aeeba7fc 100644 --- 
a/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp +++ b/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp @@ -355,6 +355,15 @@ int main(int argc, const char **argv) { errc::invalid_argument, "Duplicate targets are not allowed")); } + if (!checkOffloadBundleID(Target)) { + return reportError(createStringError( + errc::invalid_argument, + "Targets need to follow the format '-', " + "where '' follows the format " + "'----[-[:target " + "features]]'.")); + } + auto OffloadInfo = OffloadTargetInfo(Target, BundlerConfig); bool KindIsValid = OffloadInfo.isOffloadKindValid(); bool TripleIsValid = OffloadInfo.isTripleValid(); diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt index 749f1a51e6405..9bf8b28398792 100644 --- a/clang/tools/driver/CMakeLists.txt +++ b/clang/tools/driver/CMakeLists.txt @@ -104,7 +104,7 @@ foreach(link ${CLANG_LINKS_TO_CREATE} ${HLSL_LINK}) endif() endforeach() install(SCRIPT ${INSTALL_SYMLINK} - CODE "install_symlink(flang flang-classic bin create_symlink)" + CODE "install_symlink(flang flang-new bin create_symlink)" COMPONENT ${component}) else() add_clang_symlink(${link} clang) diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp index eca96dd095f72..44ee73ad3abdc 100644 --- a/clang/tools/driver/driver.cpp +++ b/clang/tools/driver/driver.cpp @@ -316,7 +316,7 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) { if (const char *OverrideStr = ::getenv("CCC_OVERRIDE_OPTIONS")) { // FIXME: Driver shouldn't take extra initial argument. 
driver::applyOverrideOptions(Args, OverrideStr, SavedStrings, - &llvm::errs()); + "CCC_OVERRIDE_OPTIONS", &llvm::errs()); } std::string Path = GetExecutablePath(ToolContext.Path, CanonicalPrefixes); diff --git a/clang/unittests/AST/DeclPrinterTest.cpp b/clang/unittests/AST/DeclPrinterTest.cpp index 6945dff537cae..124b1a166cb18 100644 --- a/clang/unittests/AST/DeclPrinterTest.cpp +++ b/clang/unittests/AST/DeclPrinterTest.cpp @@ -1196,21 +1196,21 @@ TEST(DeclPrinter, TestUnnamedTemplateParameters) { } TEST(DeclPrinter, TestUnnamedTemplateParametersPacks) { - ASSERT_TRUE(PrintedDeclCXX17Matches( - "template class ...> void A();", - functionTemplateDecl(hasName("A")).bind("id"), - "template class ...> void A()")); + ASSERT_TRUE( + PrintedDeclCXX17Matches("template class ...> void A();", + functionTemplateDecl(hasName("A")).bind("id"), + "template class ...> void A()")); } TEST(DeclPrinter, TestNamedTemplateParametersPacks) { ASSERT_TRUE(PrintedDeclCXX17Matches( "template class ...Z> void A();", + " template class ...Z> void A();", functionTemplateDecl(hasName("A")).bind("id"), "template class ...Z> void A()")); + " template class ...Z> void A()")); } TEST(DeclPrinter, TestTemplateTemplateParameterWrittenWithTypename) { diff --git a/compiler-rt/lib/asan/asan_allocator.cpp b/compiler-rt/lib/asan/asan_allocator.cpp index 3712052507946..fb64107dfae18 100644 --- a/compiler-rt/lib/asan/asan_allocator.cpp +++ b/compiler-rt/lib/asan/asan_allocator.cpp @@ -1294,6 +1294,9 @@ DECLARE_REAL(hsa_status_t, hsa_amd_ipc_memory_attach, const hsa_amd_ipc_memory_t *handle, size_t len, uint32_t num_agents, const hsa_agent_t *mapping_agents, void **mapped_ptr) DECLARE_REAL(hsa_status_t, hsa_amd_ipc_memory_detach, void *mapped_ptr) +DECLARE_REAL(hsa_status_t, hsa_amd_vmem_address_reserve_align, void** ptr, + size_t size, uint64_t address, uint64_t alignment, uint64_t flags) +DECLARE_REAL(hsa_status_t, hsa_amd_vmem_address_free, void* ptr, size_t size); namespace __asan { @@ -1322,9 +1325,8 
@@ hsa_status_t asan_hsa_amd_memory_pool_free( if (p) { instance.Deallocate(ptr, 0, 0, stack, FROM_MALLOC); return HSA_STATUS_SUCCESS; - } else { - return REAL(hsa_amd_memory_pool_free)(ptr); } + return REAL(hsa_amd_memory_pool_free)(ptr); } hsa_status_t asan_hsa_amd_agents_allow_access( @@ -1332,11 +1334,8 @@ hsa_status_t asan_hsa_amd_agents_allow_access( const void *ptr, BufferedStackTrace *stack) { void *p = get_allocator().GetBlockBegin(ptr); - if (p) { - return REAL(hsa_amd_agents_allow_access)(num_agents, agents, flags, p); - } else { - return REAL(hsa_amd_agents_allow_access)(num_agents, agents, flags, ptr); - } + return REAL(hsa_amd_agents_allow_access)(num_agents, agents, flags, + p ? p : ptr); } // For asan allocator, kMetadataSize is 0 and maximum redzone size is 2048. This @@ -1384,5 +1383,59 @@ hsa_status_t asan_hsa_amd_ipc_memory_detach(void *mapped_ptr) { reinterpret_cast(reinterpret_cast(mapped_ptr) - kPageSize_); return REAL(hsa_amd_ipc_memory_detach)(mapped_ptr_); } + +hsa_status_t asan_hsa_amd_vmem_address_reserve_align( + void** ptr, size_t size, uint64_t address, uint64_t alignment, + uint64_t flags, BufferedStackTrace* stack) { + // Bypass the tracking for a fixed address since it cannot be supported. + // Reasons: + // 1. Address may not meet the alignment/page-size requirement. + // 2. Requested range overlaps an existing reserved/mapped range. + // 3. Insufficient VA space to honor that exact placement. 
+ if (address) + return REAL(hsa_amd_vmem_address_reserve_align)(ptr, size, address, + alignment, flags); + + if (alignment < kPageSize_) + alignment = kPageSize_; + + if (UNLIKELY(!IsPowerOfTwo(alignment))) { + errno = errno_EINVAL; + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + AmdgpuAllocationInfo aa_info; + aa_info.alloc_func = + reinterpret_cast(asan_hsa_amd_vmem_address_reserve_align); + aa_info.memory_pool = {0}; + aa_info.size = size; + aa_info.flags64 = flags; + aa_info.address = 0; + aa_info.alignment = alignment; + aa_info.ptr = nullptr; + SetErrnoOnNull(*ptr = instance.Allocate(size, alignment, stack, FROM_MALLOC, + false, &aa_info)); + + return aa_info.status; +} + +hsa_status_t asan_hsa_amd_vmem_address_free(void* ptr, size_t size, + BufferedStackTrace* stack) { + if (UNLIKELY(!IsAligned(reinterpret_cast(ptr), kPageSize_))) { + errno = errno_EINVAL; + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + if (size == 0) { + errno = errno_EINVAL; + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + void* p = get_allocator().GetBlockBegin(ptr); + if (p) { + instance.Deallocate(ptr, 0, 0, stack, FROM_MALLOC); + return HSA_STATUS_SUCCESS; + } + return REAL(hsa_amd_vmem_address_free)(ptr, size); +} } // namespace __asan #endif diff --git a/compiler-rt/lib/asan/asan_allocator.h b/compiler-rt/lib/asan/asan_allocator.h index 92f80de7144a5..9cee8662d3048 100644 --- a/compiler-rt/lib/asan/asan_allocator.h +++ b/compiler-rt/lib/asan/asan_allocator.h @@ -319,6 +319,13 @@ hsa_status_t asan_hsa_amd_ipc_memory_attach( const hsa_agent_t* mapping_agents, void** mapped_ptr); hsa_status_t asan_hsa_amd_ipc_memory_detach( void* mapped_ptr); +hsa_status_t asan_hsa_amd_vmem_address_reserve_align(void** ptr, size_t size, + uint64_t address, + uint64_t alignment, + uint64_t flags, + BufferedStackTrace* stack); +hsa_status_t asan_hsa_amd_vmem_address_free(void* ptr, size_t size, + BufferedStackTrace* stack); } // namespace __asan #endif diff --git 
a/compiler-rt/lib/asan/asan_errors.cpp b/compiler-rt/lib/asan/asan_errors.cpp index e08af718bd7cb..e7c44c105d35a 100644 --- a/compiler-rt/lib/asan/asan_errors.cpp +++ b/compiler-rt/lib/asan/asan_errors.cpp @@ -649,11 +649,11 @@ void ErrorNonSelfGeneric::Print() { Decorator d; Printf("%s", d.Error()); Report("ERROR: AddressSanitizer: %s on address %p at pc %p\n", bug_descr, - (void *)addresses[0], callstack[0]); + (void *)addresses[0], (void *)callstack[0]); Printf("%s%s of size %zu at %p thread id %zu\n", d.Access(), access_size ? (is_write ? "WRITE" : "READ") : "ACCESS", access_size, - (void *)addresses[0], thread_id[0]); + (void *)addresses[0], (usize)thread_id[0]); // todo: perform symbolization for the given callstack // can be done by creating in-memory object file or by writing @@ -691,7 +691,7 @@ ErrorNonSelfAMDGPU::ErrorNonSelfAMDGPU(uptr *dev_callstack, u32 n_callstack, void ErrorNonSelfAMDGPU::PrintStack() { InternalScopedString source_location; - source_location.AppendF(" #0 %p", callstack[0]); + source_location.AppendF(" #0 %p", (void *)callstack[0]); #if SANITIZER_AMDGPU source_location.Append(" in "); __sanitizer::AMDGPUCodeObjectSymbolizer symbolizer; @@ -712,7 +712,8 @@ void ErrorNonSelfAMDGPU::PrintThreadsAndAddresses() { str.Append("\n"); per_row_count = 0; } - str.AppendF("%02d : %p ", workitem_ids[idx], device_address[idx]); + str.AppendF("%02d : %p ", (int)workitem_ids[idx], + (void *)device_address[idx]); per_row_count++; } str.Append("\n"); @@ -737,14 +738,14 @@ static uptr ScanForMagicUp(uptr start, uptr hi, uptr magic0, uptr magic1) { void ErrorNonSelfAMDGPU::PrintMallocStack() { // Facts about asan malloc on device - const uptr magic = 0xfedcba1ee1abcdefULL; + const uptr magic = static_cast(0xfedcba1ee1abcdefULL); const uptr offset = 32; const uptr min_chunk_size = 96; const uptr min_alloc_size = 48; Decorator d; HeapAddressDescription addr_description; - + if (GetHeapAddressInformation(device_address[0], access_size, &addr_description) && 
addr_description.chunk_access.chunk_size >= min_chunk_size) { @@ -755,11 +756,12 @@ void ErrorNonSelfAMDGPU::PrintMallocStack() { uptr plo = ScanForMagicDown(start, lo, magic, lo); if (plo) { callstack[0] = ((uptr*)plo)[2]; - Printf("%s%p is %u bytes above an address from a %sdevice malloc " - "(or free) call of size %u from%s\n", - d.Location(), device_address[0], - (int)(device_address[0] - (plo+offset)), - d.Allocation(), ((int*)plo)[7], d.Default()); + Printf( + "%s%p is %u bytes above an address from a %sdevice malloc " + "(or free) call of size %u from%s\n", + d.Location(), (void *)device_address[0], + (u32)(device_address[0] - (plo + offset)), d.Allocation(), + ((u32*)plo)[7], d.Default()); // TODO: The code object with the malloc call may not be the same // code object trying the illegal access. A mechanism is needed // to obtain the former. @@ -769,12 +771,13 @@ void ErrorNonSelfAMDGPU::PrintMallocStack() { uptr phi = ScanForMagicUp(start, hi, magic, lo); if (phi) { callstack[0] = ((uptr*)phi)[2]; - Printf("%s%p is %u bytes below an address from a %sdevice malloc " - "(or free) call of size %u from%s\n", - d.Location(), device_address[0], - (int)((phi+offset) - device_address[0]), + Printf( + "%s%p is %u bytes below an address from a %sdevice malloc " + "(or free) call of size %u from%s\n", + d.Location(), (void *)device_address[0], + (u32)((phi + offset) - device_address[0]), - d.Allocation(), ((int*)phi)[7], d.Default()); + d.Allocation(), ((u32*)phi)[7], d.Default()); PrintStack(); } } @@ -783,10 +786,11 @@ void ErrorNonSelfAMDGPU::PrintMallocStack() { void ErrorNonSelfAMDGPU::Print() { Decorator d; Printf("%s", d.Error()); - Report("ERROR: AddressSanitizer: %s on amdgpu device %zu at pc %p\n", - bug_descr, device_id, callstack[0]); - Printf("%s%s of size %zu in workgroup id (%zu,%zu,%zu)\n", d.Access(), - (is_write ? 
"WRITE" : "READ"), access_size, wg.idx, wg.idy, wg.idz); + Report("ERROR: AddressSanitizer: %s on amdgpu device %d at pc %p\n", + bug_descr, device_id, (void *)callstack[0]); + Printf("%s%s of size %zu in workgroup id (%llu,%llu,%llu)\n", d.Access(), + (is_write ? "WRITE" : "READ"), access_size, wg.idx, + wg.idy, wg.idz); Printf("%s", d.Default()); PrintStack(); Printf("%s", d.Location()); diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp index 5b562c498c48e..f2c41d67cdd9d 100644 --- a/compiler-rt/lib/asan/asan_interceptors.cpp +++ b/compiler-rt/lib/asan/asan_interceptors.cpp @@ -897,6 +897,22 @@ INTERCEPTOR(hsa_status_t, hsa_amd_ipc_memory_detach, void* mapped_ptr) { return asan_hsa_amd_ipc_memory_detach(mapped_ptr); } +INTERCEPTOR(hsa_status_t, hsa_amd_vmem_address_reserve_align, void** ptr, + size_t size, uint64_t address, uint64_t alignment, uint64_t flags) { + AsanInitFromRtl(); + ENSURE_HSA_INITED(); + GET_STACK_TRACE_MALLOC; + return asan_hsa_amd_vmem_address_reserve_align(ptr, size, address, alignment, + flags, &stack); +} + +INTERCEPTOR(hsa_status_t, hsa_amd_vmem_address_free, void* ptr, size_t size) { + AsanInitFromRtl(); + ENSURE_HSA_INITED(); + GET_STACK_TRACE_FREE; + return asan_hsa_amd_vmem_address_free(ptr, size, &stack); +} + void InitializeAmdgpuInterceptors() { ASAN_INTERCEPT_FUNC(hsa_memory_copy); ASAN_INTERCEPT_FUNC(hsa_amd_memory_pool_allocate); @@ -909,6 +925,8 @@ void InitializeAmdgpuInterceptors() { ASAN_INTERCEPT_FUNC(hsa_amd_ipc_memory_create); ASAN_INTERCEPT_FUNC(hsa_amd_ipc_memory_attach); ASAN_INTERCEPT_FUNC(hsa_amd_ipc_memory_detach); + ASAN_INTERCEPT_FUNC(hsa_amd_vmem_address_reserve_align); + ASAN_INTERCEPT_FUNC(hsa_amd_vmem_address_free); } void ENSURE_HSA_INITED() { diff --git a/compiler-rt/lib/dfsan/dfsan.cpp b/compiler-rt/lib/dfsan/dfsan.cpp index 886e93e5fa813..d09a9a70fd83b 100644 --- a/compiler-rt/lib/dfsan/dfsan.cpp +++ b/compiler-rt/lib/dfsan/dfsan.cpp @@ -792,7 +792,7 @@ 
static void PrintNoOriginTrackingWarning() { static void PrintNoTaintWarning(const void *address) { Decorator d; - Printf(" %sDFSan: no tainted value at %x%s\n", d.Warning(), address, + Printf(" %sDFSan: no tainted value at %zx%s\n", d.Warning(), (uptr)address, d.Default()); } diff --git a/compiler-rt/lib/hwasan/hwasan.cpp b/compiler-rt/lib/hwasan/hwasan.cpp index 24384d8b4d2cf..615bae4b3a3fc 100644 --- a/compiler-rt/lib/hwasan/hwasan.cpp +++ b/compiler-rt/lib/hwasan/hwasan.cpp @@ -176,7 +176,7 @@ static void HwasanFormatMemoryUsage(InternalScopedString &s) { "HWASAN pid: %d rss: %zd threads: %zd stacks: %zd" " thr_aux: %zd stack_depot: %zd uniq_stacks: %zd" " heap: %zd", - internal_getpid(), GetRSS(), thread_stats.n_live_threads, + (int)internal_getpid(), GetRSS(), thread_stats.n_live_threads, thread_stats.total_stack_size, thread_stats.n_live_threads * thread_list.MemoryUsedPerThread(), sds.allocated, sds.n_uniq_ids, asc[AllocatorStatMapped]); @@ -692,7 +692,7 @@ void __hwasan_handle_longjmp(const void *sp_dst) { "WARNING: HWASan is ignoring requested __hwasan_handle_longjmp: " "stack top: %p; target %p; distance: %p (%zd)\n" "False positive error reports may follow\n", - (void *)sp, (void *)dst, dst - sp, dst - sp); + (void *)sp, (void *)dst, (void *)(dst - sp), dst - sp); return; } TagMemory(sp, dst - sp, 0); diff --git a/compiler-rt/lib/hwasan/hwasan_malloc_bisect.h b/compiler-rt/lib/hwasan/hwasan_malloc_bisect.h index 7d134e8c4b7fa..52a28438f3a9b 100644 --- a/compiler-rt/lib/hwasan/hwasan_malloc_bisect.h +++ b/compiler-rt/lib/hwasan/hwasan_malloc_bisect.h @@ -41,7 +41,7 @@ static inline bool malloc_bisect(StackTrace *stack, uptr orig_size) { if (h < left || h > right) return false; if (flags()->malloc_bisect_dump) { - Printf("[alloc] %u %zu\n", h, orig_size); + Printf("[alloc] %u %zu\n", (u32)h, orig_size); stack->Print(); } return true; diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index 
bc66e6e805c91..6eafcf9163afa 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -306,8 +306,9 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa, "%p is located %zd bytes %s a %zd-byte local variable %s " "[%p,%p) " "in %s %s\n", - untagged_addr, offset, whence, local.size, local.name, best_beg, - best_beg + local.size, local.function_name, location.data()); + (void *)untagged_addr, offset, whence, local.size, local.name, + (void *)best_beg, (void *)(best_beg + local.size), + local.function_name, location.data()); location.clear(); Printf("%s\n", d.Default()); } @@ -738,8 +739,8 @@ void BaseReport::PrintHeapOrGlobalCandidate() const { Printf("%s", d.Location()); Printf("%p is located %zd bytes %s a %zd-byte region [%p,%p)\n", untagged_addr, offset, whence, - candidate.heap.end - candidate.heap.begin, candidate.heap.begin, - candidate.heap.end); + candidate.heap.end - candidate.heap.begin, + (void *)candidate.heap.begin, (void *)candidate.heap.end); Printf("%s", d.Allocation()); Printf("allocated by thread T%u here:\n", candidate.heap.thread_id); Printf("%s", d.Default()); @@ -762,11 +763,11 @@ void BaseReport::PrintHeapOrGlobalCandidate() const { Printf( "%p is located %zd bytes %s a %zd-byte global variable " "%s [%p,%p) in %s\n", - untagged_addr, + (void *)untagged_addr, candidate.after ? untagged_addr - (info.start + info.size) : info.start - untagged_addr, candidate.after ? "after" : "before", info.size, info.name, - info.start, info.start + info.size, module_name); + (void *)info.start, (void *)(info.start + info.size), module_name); } else { uptr size = GetGlobalSizeFromDescriptor(candidate.untagged_addr); if (size == 0) @@ -774,14 +775,14 @@ void BaseReport::PrintHeapOrGlobalCandidate() const { Printf( "%p is located %s a global variable in " "\n #0 0x%x (%s+0x%x)\n", - untagged_addr, candidate.after ? 
"after" : "before", - candidate.untagged_addr, module_name, module_address); + (void *)untagged_addr, candidate.after ? "after" : "before", + (void *)candidate.untagged_addr, module_name, (u32)module_address); else Printf( "%p is located %s a %zd-byte global variable in " "\n #0 0x%x (%s+0x%x)\n", - untagged_addr, candidate.after ? "after" : "before", size, - candidate.untagged_addr, module_name, module_address); + (void *)untagged_addr, candidate.after ? "after" : "before", size, + (void *)candidate.untagged_addr, module_name, (u32)module_address); } Printf("%s", d.Default()); } @@ -792,8 +793,8 @@ void BaseReport::PrintAddressDescription() const { int num_descriptions_printed = 0; if (MemIsShadow(untagged_addr)) { - Printf("%s%p is HWAsan shadow memory.\n%s", d.Location(), untagged_addr, - d.Default()); + Printf("%s%p is HWAsan shadow memory.\n%s", d.Location(), + (void *)untagged_addr, d.Default()); return; } @@ -802,7 +803,7 @@ void BaseReport::PrintAddressDescription() const { Printf( "%s[%p,%p) is a %s %s heap chunk; " "size: %zd offset: %zd\n%s", - d.Location(), heap.begin, heap.begin + heap.size, + d.Location(), (void *)heap.begin, (void *)(heap.begin + heap.size), heap.from_small_heap ? "small" : "large", heap.is_allocated ? 
"allocated" : "unallocated", heap.size, untagged_addr - heap.begin, d.Default()); @@ -821,8 +822,8 @@ void BaseReport::PrintAddressDescription() const { Printf("%s", d.Error()); Printf("\nCause: stack tag-mismatch\n"); Printf("%s", d.Location()); - Printf("Address %p is located in stack of thread T%zd\n", untagged_addr, - sa.thread_id()); + Printf("Address %p is located in stack of thread T%zd\n", + (void *)untagged_addr, (ssize)sa.thread_id()); Printf("%s", d.Default()); announce_by_id(sa.thread_id()); PrintStackAllocations(sa.get(), ptr_tag, untagged_addr); @@ -842,9 +843,9 @@ void BaseReport::PrintAddressDescription() const { Printf("\nCause: use-after-free\n"); Printf("%s", d.Location()); Printf("%p is located %zd bytes inside a %zd-byte region [%p,%p)\n", - untagged_addr, untagged_addr - UntagAddr(har.tagged_addr), - har.requested_size, UntagAddr(har.tagged_addr), - UntagAddr(har.tagged_addr) + har.requested_size); + (void *)untagged_addr, untagged_addr - UntagAddr(har.tagged_addr), + (ssize)har.requested_size, UntagAddr(har.tagged_addr), + (void *)(UntagAddr(har.tagged_addr) + har.requested_size)); Printf("%s", d.Allocation()); Printf("freed by thread T%u here:\n", ha.free_thread_id); Printf("%s", d.Default()); @@ -858,7 +859,7 @@ void BaseReport::PrintAddressDescription() const { // Print a developer note: the index of this heap object // in the thread's deallocation ring buffer. 
Printf("hwasan_dev_note_heap_rb_distance: %zd %zd\n", ha.ring_index + 1, - flags()->heap_history_size); + (ssize)flags()->heap_history_size); Printf("hwasan_dev_note_num_matching_addrs: %zd\n", ha.num_matching_addrs); Printf("hwasan_dev_note_num_matching_addrs_4b: %zd\n", ha.num_matching_addrs_4b); @@ -915,10 +916,11 @@ InvalidFreeReport::~InvalidFreeReport() { const Thread *thread = GetCurrentThread(); if (thread) { Report("ERROR: %s: %s on address %p at pc %p on thread T%zd\n", - SanitizerToolName, bug_type, untagged_addr, pc, thread->unique_id()); + SanitizerToolName, bug_type, (void *)untagged_addr, (void *)pc, + (ssize)thread->unique_id()); } else { Report("ERROR: %s: %s on address %p at pc %p on unknown thread\n", - SanitizerToolName, bug_type, untagged_addr, pc); + SanitizerToolName, bug_type, (void *)untagged_addr, (void *)pc); } Printf("%s", d.Access()); if (shadow.addr) { @@ -967,7 +969,8 @@ TailOverwrittenReport::~TailOverwrittenReport() { Printf("%s", d.Error()); const char *bug_type = "allocation-tail-overwritten"; Report("ERROR: %s: %s; heap object [%p,%p) of size %zd\n", SanitizerToolName, - bug_type, untagged_addr, untagged_addr + orig_size, orig_size); + bug_type, (void *)untagged_addr, (void *)(untagged_addr + orig_size), + orig_size); Printf("\n%s", d.Default()); Printf( "Stack of invalid access unknown. Issue detected at deallocation " @@ -1037,7 +1040,7 @@ TagMismatchReport::~TagMismatchReport() { uptr pc = GetTopPc(stack); Printf("%s", d.Error()); Report("ERROR: %s: %s on address %p at pc %p\n", SanitizerToolName, bug_type, - untagged_addr, pc); + (void *)untagged_addr, (void *)pc); Thread *t = GetCurrentThread(); @@ -1049,12 +1052,12 @@ TagMismatchReport::~TagMismatchReport() { GetShortTagCopy(MemToShadow(untagged_addr + mismatch_offset)); Printf( "%s of size %zu at %p tags: %02x/%02x(%02x) (ptr/mem) in thread T%zd\n", - is_store ? "WRITE" : "READ", access_size, untagged_addr, ptr_tag, - mem_tag, short_tag, t->unique_id()); + is_store ? 
"WRITE" : "READ", access_size, (void *)untagged_addr, + ptr_tag, mem_tag, short_tag, (ssize)t->unique_id()); } else { Printf("%s of size %zu at %p tags: %02x/%02x (ptr/mem) in thread T%zd\n", - is_store ? "WRITE" : "READ", access_size, untagged_addr, ptr_tag, - mem_tag, t->unique_id()); + is_store ? "WRITE" : "READ", access_size, (void *)untagged_addr, + ptr_tag, mem_tag, (ssize)t->unique_id()); } if (mismatch_offset) Printf("Invalid access starting at offset %zu\n", mismatch_offset); @@ -1093,7 +1096,7 @@ void ReportTagMismatch(StackTrace *stack, uptr tagged_addr, uptr access_size, // See the frame breakdown defined in __hwasan_tag_mismatch (from // hwasan_tag_mismatch_{aarch64,riscv64}.S). void ReportRegisters(const uptr *frame, uptr pc) { - Printf("\nRegisters where the failure occurred (pc %p):\n", pc); + Printf("\nRegisters where the failure occurred (pc %p):\n", (void *)pc); // We explicitly print a single line (4 registers/line) each iteration to // reduce the amount of logcat error messages printed. 
Each Printf() will diff --git a/compiler-rt/lib/hwasan/hwasan_thread.cpp b/compiler-rt/lib/hwasan/hwasan_thread.cpp index 8b32e4e760e2f..7e59ee8fc076d 100644 --- a/compiler-rt/lib/hwasan/hwasan_thread.cpp +++ b/compiler-rt/lib/hwasan/hwasan_thread.cpp @@ -120,9 +120,10 @@ void Thread::Destroy() { } void Thread::Print(const char *Prefix) { - Printf("%sT%zd %p stack: [%p,%p) sz: %zd tls: [%p,%p)\n", Prefix, unique_id_, - (void *)this, stack_bottom(), stack_top(), - stack_top() - stack_bottom(), tls_begin(), tls_end()); + Printf("%sT%zd %p stack: [%p,%p) sz: %zd tls: [%p,%p)\n", Prefix, + (ssize)unique_id_, (void *)this, (void *)stack_bottom(), + (void *)stack_top(), stack_top() - stack_bottom(), (void *)tls_begin(), + (void *)tls_end()); } static u32 xorshift(u32 state) { diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp index 7ab9e4ff2ac9f..7dbcab36683d2 100644 --- a/compiler-rt/lib/lsan/lsan_common.cpp +++ b/compiler-rt/lib/lsan/lsan_common.cpp @@ -805,7 +805,7 @@ static bool ReportUnsuspendedThreads( succeded = false; Report( "Running thread %zu was not suspended. 
False leaks are possible.\n", - os_id); + (usize)os_id); } } return succeded; diff --git a/compiler-rt/lib/memprof/memprof_shadow_setup.cpp b/compiler-rt/lib/memprof/memprof_shadow_setup.cpp index e7832f656ee8e..7712a94fde3d6 100644 --- a/compiler-rt/lib/memprof/memprof_shadow_setup.cpp +++ b/compiler-rt/lib/memprof/memprof_shadow_setup.cpp @@ -29,7 +29,7 @@ static void ProtectGap(uptr addr, uptr size) { Printf("protect_shadow_gap=0:" " not protecting shadow gap, allocating gap's shadow\n" "|| `[%p, %p]` || ShadowGap's shadow ||\n", - GapShadowBeg, GapShadowEnd); + (void *)GapShadowBeg, (void *)GapShadowEnd); ReserveShadowMemoryRange(GapShadowBeg, GapShadowEnd, "unprotected gap shadow"); return; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.cpp index 5fb47c9f9a0b0..cf10cb773e746 100755 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.cpp @@ -22,6 +22,11 @@ struct HsaMemoryFunctions { void *(*alloc)(size_t), uint32_t *num_agents_accessible, hsa_agent_t **accessible); + hsa_status_t (*vmem_address_reserve_align)(void** ptr, size_t size, + uint64_t address, + uint64_t alignment, + uint64_t flags); + hsa_status_t (*vmem_address_free)(void* ptr, size_t size); }; static HsaMemoryFunctions hsa_amd; @@ -37,20 +42,30 @@ bool AmdgpuMemFuncs::Init() { RTLD_NEXT, "hsa_amd_memory_pool_free"); hsa_amd.pointer_info = (decltype(hsa_amd.pointer_info))dlsym( RTLD_NEXT, "hsa_amd_pointer_info"); + hsa_amd.vmem_address_reserve_align = + (decltype(hsa_amd.vmem_address_reserve_align))dlsym( + RTLD_NEXT, "hsa_amd_vmem_address_reserve_align"); + hsa_amd.vmem_address_free = (decltype(hsa_amd.vmem_address_free))dlsym( + RTLD_NEXT, "hsa_amd_vmem_address_free"); if (!hsa_amd.memory_pool_allocate || !hsa_amd.memory_pool_free || - !hsa_amd.pointer_info) + !hsa_amd.pointer_info || !hsa_amd.vmem_address_reserve_align || 
+ !hsa_amd.vmem_address_free) return false; - else - return true; + return true; } void *AmdgpuMemFuncs::Allocate(uptr size, uptr alignment, DeviceAllocationInfo *da_info) { AmdgpuAllocationInfo *aa_info = reinterpret_cast(da_info); - - aa_info->status = hsa_amd.memory_pool_allocate(aa_info->memory_pool, size, - aa_info->flags, &aa_info->ptr); + if (!aa_info->memory_pool.handle) { + aa_info->status = hsa_amd.vmem_address_reserve_align( + &aa_info->ptr, size, aa_info->address, aa_info->alignment, + aa_info->flags64); + } else { + aa_info->status = hsa_amd.memory_pool_allocate( + aa_info->memory_pool, size, aa_info->flags, &aa_info->ptr); + } if (aa_info->status != HSA_STATUS_SUCCESS) return nullptr; @@ -58,10 +73,18 @@ void *AmdgpuMemFuncs::Allocate(uptr size, uptr alignment, } void AmdgpuMemFuncs::Deallocate(void *p) { - UNUSED hsa_status_t status = hsa_amd.memory_pool_free(p); + DevicePointerInfo DevPtrInfo; + if (AmdgpuMemFuncs::GetPointerInfo(reinterpret_cast(p), &DevPtrInfo)) { + if (DevPtrInfo.type == HSA_EXT_POINTER_TYPE_HSA) { + UNUSED hsa_status_t status = hsa_amd.memory_pool_free(p); + } else if (DevPtrInfo.type == HSA_EXT_POINTER_TYPE_RESERVED_ADDR) { + UNUSED hsa_status_t status = + hsa_amd.vmem_address_free(p, DevPtrInfo.map_size); + } + } } -bool AmdgpuMemFuncs::GetPointerInfo(uptr ptr, DevivePointerInfo *ptr_info) { +bool AmdgpuMemFuncs::GetPointerInfo(uptr ptr, DevicePointerInfo* ptr_info) { hsa_amd_pointer_info_t info; info.size = sizeof(hsa_amd_pointer_info_t); hsa_status_t status = @@ -70,8 +93,12 @@ bool AmdgpuMemFuncs::GetPointerInfo(uptr ptr, DevivePointerInfo *ptr_info) { if (status != HSA_STATUS_SUCCESS) return false; - ptr_info->map_beg = reinterpret_cast(info.agentBaseAddress); + if (info.type == HSA_EXT_POINTER_TYPE_RESERVED_ADDR) + ptr_info->map_beg = reinterpret_cast(info.hostBaseAddress); + else if (info.type == HSA_EXT_POINTER_TYPE_HSA) + ptr_info->map_beg = reinterpret_cast(info.agentBaseAddress); ptr_info->map_size = 
info.sizeInBytes; + ptr_info->type = reinterpret_cast(info.type); return true; } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.h index 634731703aba3..84b62964e5145 100755 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.h @@ -20,7 +20,7 @@ class AmdgpuMemFuncs { static void *Allocate(uptr size, uptr alignment, DeviceAllocationInfo *da_info); static void Deallocate(void *p); - static bool GetPointerInfo(uptr ptr, DevivePointerInfo *ptr_info); + static bool GetPointerInfo(uptr ptr, DevicePointerInfo* ptr_info); static uptr GetPageSize(); }; @@ -32,8 +32,11 @@ struct AmdgpuAllocationInfo : public DeviceAllocationInfo { hsa_status_t status; void *alloc_func; hsa_amd_memory_pool_t memory_pool; - size_t size; - uint32_t flags; + u64 alignment; + u64 address; + u64 flags64; + usize size; + u32 flags; void *ptr; }; #endif // SANITIZER_AMDGPU diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_device.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_device.h index 9feb0549b33b3..f76800da79ac3 100755 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_device.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_device.h @@ -31,7 +31,8 @@ struct DeviceAllocationInfo { DeviceAllocationType type_; }; -struct DevivePointerInfo { +struct DevicePointerInfo { + u64 type; uptr map_beg; uptr map_size; }; @@ -165,7 +166,7 @@ class DeviceAllocatorT { : nullptr; } - void *GetBlockBegin(const void *ptr) const { + void* GetBlockBegin(const void* ptr) const { Header header; if (!mem_funcs_inited_) return nullptr; uptr p = reinterpret_cast(ptr); @@ -182,7 +183,7 @@ class DeviceAllocatorT { if (!nearest_chunk) return nullptr; if (p != nearest_chunk) { - Header *h = GetHeader(nearest_chunk, &header); + Header* h = GetHeader(nearest_chunk, &header); CHECK_GE(nearest_chunk, h->map_beg); 
CHECK_LT(nearest_chunk, h->map_beg + h->map_size); CHECK_LE(nearest_chunk, p); @@ -297,7 +298,7 @@ class DeviceAllocatorT { return mem_funcs_inited_; } - typedef DevivePointerInfo Header; + typedef DevicePointerInfo Header; Header *GetHeaderAnyPointer(uptr p, Header* h) const { CHECK(IsAligned(p, page_size_)); diff --git a/compiler-rt/lib/xray/xray_init.cpp b/compiler-rt/lib/xray/xray_init.cpp index 020bfe52b5320..9cc6d5fcc4c1d 100644 --- a/compiler-rt/lib/xray/xray_init.cpp +++ b/compiler-rt/lib/xray/xray_init.cpp @@ -105,7 +105,7 @@ __xray_register_sleds(const XRaySledEntry *SledsBegin, } if (Verbosity()) - Report("Registering %d new functions!\n", SledMap.Functions); + Report("Registering %d new functions!\n", (int)SledMap.Functions); { SpinMutexLock Guard(&XRayInstrMapMutex); diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 4ec492c266d80..d96f9f2b4d178 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -306,7 +306,8 @@ XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) { return XRayPatchingStatus::NOT_INITIALIZED; if (Verbosity()) - Report("Patching object %d with %d functions.\n", ObjId, InstrMap.Entries); + Report("Patching object %d with %d functions.\n", ObjId, + (int)InstrMap.Entries); // Check if the corresponding DSO has been unloaded. if (!InstrMap.Loaded) { diff --git a/flang/EnableFlangBuild b/flang/EnableFlangBuild index e69de29bb2d1d..fb5dbc49bf7ff 100644 --- a/flang/EnableFlangBuild +++ b/flang/EnableFlangBuild @@ -0,0 +1 @@ +DisableClassic diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md index 815c26a28dfdf..1e984be3b8f0a 100644 --- a/flang/docs/FlangDriver.md +++ b/flang/docs/FlangDriver.md @@ -619,3 +619,31 @@ nvfortran defines `-fast` as - `-Mcache_align`: there is no equivalent flag in Flang or Clang. 
- `-Mflushz`: flush-to-zero mode - when `-ffast-math` is specified, Flang will link to `crtfastmath.o` to ensure denormal numbers are flushed to zero. + + +## FCC_OVERRIDE_OPTIONS + +The environment variable `FCC_OVERRIDE_OPTIONS` can be used to edit flang's +command line arguments. The value of this variable is a space-separated list of +edits to perform. The edits are applied in the order in which they appear in +`FCC_OVERRIDE_OPTIONS`. Each edit should be one of the following form: + +- `#`: Silence information about the changes to the command line arguments. + +- `^FOO`: Add `FOO` as a new argument at the beginning of the command line right + after the name of the compiler executable. + +- `+FOO`: Add `FOO` as a new argument at the end of the command line. + +- `s/XXX/YYY/`: Substitute the regular expression `XXX` with `YYY` in the + command line. + +- `xOPTION`: Removes all instances of the literal argument `OPTION`. + +- `XOPTION`: Removes all instances of the literal argument `OPTION`, and the + following argument. + +- `Ox`: Removes all flags matching `O` or `O[sz0-9]` and adds `Ox` at the end + of the command line. + +This environment variable does not affect the options added by the config files. 
diff --git a/flang/include/flang/Optimizer/Builder/DirectivesCommon.h b/flang/include/flang/Optimizer/Builder/DirectivesCommon.h index 443b0ee59007f..ee3aa87314099 100644 --- a/flang/include/flang/Optimizer/Builder/DirectivesCommon.h +++ b/flang/include/flang/Optimizer/Builder/DirectivesCommon.h @@ -17,6 +17,8 @@ #ifndef FORTRAN_OPTIMIZER_BUILDER_DIRECTIVESCOMMON_H_ #define FORTRAN_OPTIMIZER_BUILDER_DIRECTIVESCOMMON_H_ +#include "BoxValue.h" +#include "FIRBuilder.h" #include "flang/Optimizer/Builder/BoxValue.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Todo.h" @@ -129,6 +131,31 @@ gatherBoundsOrBoundValues(fir::FirOpBuilder &builder, mlir::Location loc, } return values; } +template +mlir::Value +genBoundsOpFromBoxChar(fir::FirOpBuilder &builder, mlir::Location loc, + fir::ExtendedValue dataExv, AddrAndBoundsInfo &info) { + // TODO: Handle info.isPresent. + if (auto boxCharType = + mlir::dyn_cast(info.addr.getType())) { + mlir::Type idxTy = builder.getIndexType(); + mlir::Type lenType = builder.getCharacterLengthType(); + mlir::Type refType = builder.getRefType(boxCharType.getEleTy()); + auto unboxed = + builder.create(loc, refType, lenType, info.addr); + mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); + mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); + mlir::Value extent = unboxed.getResult(1); + mlir::Value stride = one; + mlir::Value ub = builder.create(loc, extent, one); + mlir::Type boundTy = builder.getType(); + return builder.create( + loc, boundTy, /*lower_bound=*/zero, + /*upper_bound=*/ub, /*extent=*/extent, /*stride=*/stride, + /*stride_in_bytes=*/true, /*start_idx=*/zero); + } + return mlir::Value{}; +} /// Generate the bounds operation from the descriptor information. 
template @@ -248,6 +275,10 @@ genImplicitBoundsOps(fir::FirOpBuilder &builder, AddrAndBoundsInfo &info, bounds = genBaseBoundsOps(builder, loc, dataExv, dataExvIsAssumedSize); } + if (characterWithDynamicLen(fir::unwrapRefType(baseOp.getType()))) { + bounds = {genBoundsOpFromBoxChar(builder, loc, + dataExv, info)}; + } return bounds; } diff --git a/flang/include/flang/Semantics/openmp-directive-sets.h b/flang/include/flang/Semantics/openmp-directive-sets.h index 7cdca1214e749..dd610c9702c28 100644 --- a/flang/include/flang/Semantics/openmp-directive-sets.h +++ b/flang/include/flang/Semantics/openmp-directive-sets.h @@ -21,6 +21,8 @@ namespace llvm::omp { //===----------------------------------------------------------------------===// // - topSet: The directive appears alone or as the first in a // compound construct. +// - bottomSet: The directive appears alone or as the last in a +// compound construct. // - allSet: All standalone or compound uses of the directive. static const OmpDirectiveSet topDistributeSet{ @@ -172,6 +174,11 @@ static const OmpDirectiveSet topTeamsSet{ Directive::OMPD_teams_loop, }; +static const OmpDirectiveSet bottomTeamsSet{ + Directive::OMPD_target_teams, + Directive::OMPD_teams, +}; + static const OmpDirectiveSet allTeamsSet{ OmpDirectiveSet{ Directive::OMPD_target_teams, diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 5b5b8e9a92ea8..10b033a83f14d 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -217,27 +217,8 @@ static void bindEntryBlockArgs(lower::AbstractConverter &converter, assert(args.isValid() && "invalid args"); fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - auto bindSingleMapLike = [&converter, - &firOpBuilder](const semantics::Symbol &sym, - const mlir::BlockArgument &arg) { - // Clones the `bounds` placing them inside the entry block and returns - // them. 
- auto cloneBound = [&](mlir::Value bound) { - if (mlir::isMemoryEffectFree(bound.getDefiningOp())) { - mlir::Operation *clonedOp = firOpBuilder.clone(*bound.getDefiningOp()); - return clonedOp->getResult(0); - } - TODO(converter.getCurrentLocation(), - "target map-like clause operand unsupported bound type"); - }; - - auto cloneBounds = [cloneBound](llvm::ArrayRef bounds) { - llvm::SmallVector clonedBounds; - llvm::transform(bounds, std::back_inserter(clonedBounds), - [&](mlir::Value bound) { return cloneBound(bound); }); - return clonedBounds; - }; - + auto bindSingleMapLike = [&converter](const semantics::Symbol &sym, + const mlir::BlockArgument &arg) { fir::ExtendedValue extVal = converter.getSymbolExtendedValue(sym); auto refType = mlir::dyn_cast(arg.getType()); if (refType && fir::isa_builtin_cptr_type(refType.getElementType())) { @@ -245,31 +226,27 @@ static void bindEntryBlockArgs(lower::AbstractConverter &converter, } else { extVal.match( [&](const fir::BoxValue &v) { - converter.bindSymbol(sym, - fir::BoxValue(arg, cloneBounds(v.getLBounds()), - v.getExplicitParameters(), - v.getExplicitExtents())); + converter.bindSymbol(sym, fir::BoxValue(arg, v.getLBounds(), + v.getExplicitParameters(), + v.getExplicitExtents())); }, [&](const fir::MutableBoxValue &v) { converter.bindSymbol( - sym, fir::MutableBoxValue(arg, cloneBounds(v.getLBounds()), + sym, fir::MutableBoxValue(arg, v.getLBounds(), v.getMutableProperties())); }, [&](const fir::ArrayBoxValue &v) { - converter.bindSymbol( - sym, fir::ArrayBoxValue(arg, cloneBounds(v.getExtents()), - cloneBounds(v.getLBounds()), - v.getSourceBox())); + converter.bindSymbol(sym, fir::ArrayBoxValue(arg, v.getExtents(), + v.getLBounds(), + v.getSourceBox())); }, [&](const fir::CharArrayBoxValue &v) { - converter.bindSymbol( - sym, fir::CharArrayBoxValue(arg, cloneBound(v.getLen()), - cloneBounds(v.getExtents()), - cloneBounds(v.getLBounds()))); + converter.bindSymbol(sym, fir::CharArrayBoxValue(arg, v.getLen(), + 
v.getExtents(), + v.getLBounds())); }, [&](const fir::CharBoxValue &v) { - converter.bindSymbol( - sym, fir::CharBoxValue(arg, cloneBound(v.getLen()))); + converter.bindSymbol(sym, fir::CharBoxValue(arg, v.getLen())); }, [&](const fir::UnboxedValue &v) { converter.bindSymbol(sym, arg); }, [&](const auto &) { @@ -1338,14 +1315,13 @@ static void genBodyOfTargetOp( while (!valuesDefinedAbove.empty()) { for (mlir::Value val : valuesDefinedAbove) { mlir::Operation *valOp = val.getDefiningOp(); - assert(valOp != nullptr); // NOTE: We skip BoxDimsOp's as the lesser of two evils is to map the // indices separately, as the alternative is to eventually map the Box, // which comes with a fairly large overhead comparatively. We could be // more robust about this and check using a BackwardsSlice to see if we // run the risk of mapping a box. - if (mlir::isMemoryEffectFree(valOp) && + if (valOp && mlir::isMemoryEffectFree(valOp) && !mlir::isa(valOp)) { mlir::Operation *clonedOp = valOp->clone(); entryBlock->push_front(clonedOp); @@ -1358,7 +1334,13 @@ static void genBodyOfTargetOp( valOp->replaceUsesWithIf(clonedOp, replace); } else { auto savedIP = firOpBuilder.getInsertionPoint(); - firOpBuilder.setInsertionPointAfter(valOp); + + if (valOp) + firOpBuilder.setInsertionPointAfter(valOp); + else + // This means val is a block argument + firOpBuilder.setInsertionPoint(targetOp); + auto copyVal = firOpBuilder.createTemporary(val.getLoc(), val.getType()); firOpBuilder.createStoreWithConvert(copyVal.getLoc(), val, copyVal); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index e75a29c968d17..78ac2fe54dbd5 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -1052,8 +1052,10 @@ static constexpr MathOperation mathOperations[] = { {"acos", "cacos", genFuncType, Ty::Complex<8>>, genLibCall}, {"acos", RTNAME_STRING(CAcosF128), FuncTypeComplex16Complex16, genLibF128Call}, 
- {"acosh", "acoshf", genFuncType, Ty::Real<4>>, genLibCall}, - {"acosh", "acosh", genFuncType, Ty::Real<8>>, genLibCall}, + {"acosh", "acoshf", genFuncType, Ty::Real<4>>, + genMathOp}, + {"acosh", "acosh", genFuncType, Ty::Real<8>>, + genMathOp}, {"acosh", RTNAME_STRING(AcoshF128), FuncTypeReal16Real16, genLibF128Call}, {"acosh", "cacoshf", genFuncType, Ty::Complex<4>>, genLibCall}, @@ -1077,15 +1079,19 @@ static constexpr MathOperation mathOperations[] = { {"anint", "llvm.round.f80", genFuncType, Ty::Real<10>>, genMathOp}, {"anint", RTNAME_STRING(RoundF128), FuncTypeReal16Real16, genLibF128Call}, - {"asin", "asinf", genFuncType, Ty::Real<4>>, genLibCall}, - {"asin", "asin", genFuncType, Ty::Real<8>>, genLibCall}, + {"asin", "asinf", genFuncType, Ty::Real<4>>, + genMathOp}, + {"asin", "asin", genFuncType, Ty::Real<8>>, + genMathOp}, {"asin", RTNAME_STRING(AsinF128), FuncTypeReal16Real16, genLibF128Call}, {"asin", "casinf", genFuncType, Ty::Complex<4>>, genLibCall}, {"asin", "casin", genFuncType, Ty::Complex<8>>, genLibCall}, {"asin", RTNAME_STRING(CAsinF128), FuncTypeComplex16Complex16, genLibF128Call}, - {"asinh", "asinhf", genFuncType, Ty::Real<4>>, genLibCall}, - {"asinh", "asinh", genFuncType, Ty::Real<8>>, genLibCall}, + {"asinh", "asinhf", genFuncType, Ty::Real<4>>, + genMathOp}, + {"asinh", "asinh", genFuncType, Ty::Real<8>>, + genMathOp}, {"asinh", RTNAME_STRING(AsinhF128), FuncTypeReal16Real16, genLibF128Call}, {"asinh", "casinhf", genFuncType, Ty::Complex<4>>, genLibCall}, @@ -1114,8 +1120,10 @@ static constexpr MathOperation mathOperations[] = { genMathOp}, {"atan2", RTNAME_STRING(Atan2F128), FuncTypeReal16Real16Real16, genLibF128Call}, - {"atanh", "atanhf", genFuncType, Ty::Real<4>>, genLibCall}, - {"atanh", "atanh", genFuncType, Ty::Real<8>>, genLibCall}, + {"atanh", "atanhf", genFuncType, Ty::Real<4>>, + genMathOp}, + {"atanh", "atanh", genFuncType, Ty::Real<8>>, + genMathOp}, {"atanh", RTNAME_STRING(AtanhF128), FuncTypeReal16Real16, genLibF128Call}, 
{"atanh", "catanhf", genFuncType, Ty::Complex<4>>, genLibCall}, diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 9ad176f72486c..7e6a1d9bca72e 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -3197,10 +3197,9 @@ struct LoadOpConversion : public fir::FIROpConversion { llvm::LogicalResult matchAndRewrite(fir::LoadOp load, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { - mlir::Type llvmLoadTy = convertObjectType(load.getType()); if (auto boxTy = mlir::dyn_cast(load.getType())) { - // fir.box is a special case because it is considered an ssa value in + // fir.box is a special case because it is considered as an ssa values in // fir, but it is lowered as a pointer to a descriptor. So // fir.ref and fir.box end up being the same llvm types and // loading a fir.ref is implemented as taking a snapshot of the @@ -3224,16 +3223,30 @@ struct LoadOpConversion : public fir::FIROpConversion { newBoxStorage = genAllocaAndAddrCastWithType(loc, llvmLoadTy, defaultAlign, rewriter); - TypePair boxTypePair{boxTy, llvmLoadTy}; - mlir::Value boxSize = - computeBoxSize(loc, boxTypePair, inputBoxStorage, rewriter); - auto memcpy = rewriter.create( - loc, newBoxStorage, inputBoxStorage, boxSize, /*isVolatile=*/false); - - if (std::optional optionalTag = load.getTbaa()) - memcpy.setTBAATags(*optionalTag); - else - attachTBAATag(memcpy, boxTy, boxTy, nullptr); + // TODO: always generate llvm.memcpy, LLVM is better at optimizing it than + // aggregate loads + stores. 
+ if (boxTy.isAssumedRank()) { + + TypePair boxTypePair{boxTy, llvmLoadTy}; + mlir::Value boxSize = + computeBoxSize(loc, boxTypePair, inputBoxStorage, rewriter); + auto memcpy = rewriter.create( + loc, newBoxStorage, inputBoxStorage, boxSize, /*isVolatile=*/false); + if (std::optional optionalTag = load.getTbaa()) + memcpy.setTBAATags(*optionalTag); + else + attachTBAATag(memcpy, boxTy, boxTy, nullptr); + } else { + auto boxValue = rewriter.create(loc, llvmLoadTy, + inputBoxStorage); + if (std::optional optionalTag = load.getTbaa()) + boxValue.setTBAATags(*optionalTag); + else + attachTBAATag(boxValue, boxTy, boxTy, nullptr); + auto storeOp = + rewriter.create(loc, boxValue, newBoxStorage); + attachTBAATag(storeOp, boxTy, boxTy, nullptr); + } rewriter.replaceOp(load, newBoxStorage); } else { auto loadOp = rewriter.create( @@ -3517,13 +3530,20 @@ struct StoreOpConversion : public fir::FIROpConversion { mlir::LLVM::AliasAnalysisOpInterface newOp; if (auto boxTy = mlir::dyn_cast(storeTy)) { mlir::Type llvmBoxTy = lowerTy().convertBoxTypeAsStruct(boxTy); - // Always use memcpy because LLVM is not as effective at optimizing - // aggregate loads/stores as it is optimizing memcpy. - TypePair boxTypePair{boxTy, llvmBoxTy}; - mlir::Value boxSize = - computeBoxSize(loc, boxTypePair, llvmValue, rewriter); - newOp = rewriter.create( - loc, llvmMemref, llvmValue, boxSize, /*isVolatile=*/false); + // fir.box value is actually in memory, load it first before storing it, + // or do a memcopy for assumed-rank descriptors. 
+ if (boxTy.isAssumedRank()) { + TypePair boxTypePair{boxTy, llvmBoxTy}; + mlir::Value boxSize = + computeBoxSize(loc, boxTypePair, llvmValue, rewriter); + newOp = rewriter.create( + loc, llvmMemref, llvmValue, boxSize, /*isVolatile=*/false); + } else { + auto val = + rewriter.create(loc, llvmBoxTy, llvmValue); + attachTBAATag(val, boxTy, boxTy, nullptr); + newOp = rewriter.create(loc, val, llvmMemref); + } } else { newOp = rewriter.create(loc, llvmValue, llvmMemref); } diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index 50b77583ca4e3..900fbb3b5f351 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -47,6 +47,8 @@ #include #include +#define DEBUG_TYPE "omp-map-info-finalization" +#define PDBGS() (llvm::dbgs() << "[" << DEBUG_TYPE << "]: ") namespace flangomp { #define GEN_PASS_DEF_MAPINFOFINALIZATIONPASS #include "flang/Optimizer/OpenMP/Passes.h.inc" @@ -500,7 +502,38 @@ class MapInfoFinalizationPass // iterations from previous function scopes. localBoxAllocas.clear(); - // First, walk `omp.map.info` ops to see if any record members should be + // First, walk `omp.map.info` ops to see if any of them have varPtrs + // with an underlying type of fir.char, i.e a character + // with dynamic length. If so, check if they need bounds added. 
+ func->walk([&](mlir::omp::MapInfoOp op) { + if (!op.getBounds().empty()) + return; + + mlir::Value varPtr = op.getVarPtr(); + mlir::Type underlyingVarType = fir::unwrapRefType(varPtr.getType()); + + if (!fir::characterWithDynamicLen(underlyingVarType)) + return; + + fir::factory::AddrAndBoundsInfo info = + fir::factory::getDataOperandBaseAddr( + builder, varPtr, /*isOptional=*/false, varPtr.getLoc()); + fir::ExtendedValue extendedValue = + hlfir::translateToExtendedValue(varPtr.getLoc(), builder, + hlfir::Entity{info.addr}, + /*continguousHint=*/true) + .first; + builder.setInsertionPoint(op); + llvm::SmallVector boundsOps = + fir::factory::genImplicitBoundsOps( + builder, info, extendedValue, + /*dataExvIsAssumedSize=*/false, varPtr.getLoc()); + + op.getBoundsMutable().append(boundsOps); + }); + + // Next, walk `omp.map.info` ops to see if any record members should be // implicitly mapped. // TODO/FIXME/UPDATE: I believe we need to add implicit capture of // allocatable members of arbitrary depths for this before we can diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 00a031e0dcad7..89bb76461ad2f 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -475,8 +475,7 @@ void OmpStructureChecker::HasInvalidDistributeNesting( violation = true; } else { // `distribute` region has to be strictly nested inside `teams` - if (!OmpDirectiveSet{llvm::omp::OMPD_teams, llvm::omp::OMPD_target_teams} - .test(GetContextParent().directive)) { + if (!llvm::omp::bottomTeamsSet.test(GetContextParent().directive)) { violation = true; } } @@ -506,8 +505,7 @@ void OmpStructureChecker::HasInvalidLoopBinding( if (llvm::omp::Directive::OMPD_loop == beginDir.v && CurrentDirectiveIsNested() && - OmpDirectiveSet{llvm::omp::OMPD_teams, llvm::omp::OMPD_target_teams}.test( - GetContextParent().directive)) { + llvm::omp::bottomTeamsSet.test(GetContextParent().directive)) { 
teamsBindingChecker( "`BIND(TEAMS)` must be specified since the `LOOP` region is " "strictly nested inside a `TEAMS` region."_err_en_US); @@ -698,7 +696,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) { HasInvalidDistributeNesting(x); HasInvalidLoopBinding(x); if (CurrentDirectiveIsNested() && - llvm::omp::topTeamsSet.test(GetContextParent().directive)) { + llvm::omp::bottomTeamsSet.test(GetContextParent().directive)) { HasInvalidTeamsNesting(beginDir.v, beginDir.source); } if ((beginDir.v == llvm::omp::Directive::OMPD_distribute_parallel_do_simd) || @@ -1141,7 +1139,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPBlockConstruct &x) { } if (CurrentDirectiveIsNested()) { - if (llvm::omp::topTeamsSet.test(GetContextParent().directive)) { + if (llvm::omp::bottomTeamsSet.test(GetContextParent().directive)) { HasInvalidTeamsNesting(beginDir.v, beginDir.source); } if (GetContext().directive == llvm::omp::Directive::OMPD_master) { diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 2bd70d7d2b935..a91a8c05f17b6 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -2467,6 +2467,15 @@ void OmpAttributeVisitor::ResolveOmpObject( name->ToString()); } } + if (ompFlag == Symbol::Flag::OmpDeclareTarget) { + if (symbol->IsFuncResult()) { + if (Symbol * func{currScope().symbol()}) { + CHECK(func->IsSubprogram()); + func->set(ompFlag); + name->symbol = func; + } + } + } if (GetContext().directive == llvm::omp::Directive::OMPD_target_data) { checkExclusivelists(symbol, Symbol::Flag::OmpUseDevicePtr, diff --git a/flang/lib/Semantics/unparse-with-symbols.cpp b/flang/lib/Semantics/unparse-with-symbols.cpp index 02afb89ae57fa..2716d88efb9fb 100644 --- a/flang/lib/Semantics/unparse-with-symbols.cpp +++ b/flang/lib/Semantics/unparse-with-symbols.cpp @@ -61,6 +61,14 @@ class SymbolDumpVisitor { currStmt_ = std::nullopt; } + bool Pre(const 
parser::OpenMPDeclareTargetConstruct &x) { + currStmt_ = x.source; + return true; + } + void Post(const parser::OpenMPDeclareTargetConstruct &) { + currStmt_ = std::nullopt; + } + private: std::optional currStmt_; // current statement we are processing std::multimap symbols_; // location to symbol diff --git a/flang/test/Driver/Inputs/config-7.cfg b/flang/test/Driver/Inputs/config-7.cfg new file mode 100644 index 0000000000000..2f41be663b282 --- /dev/null +++ b/flang/test/Driver/Inputs/config-7.cfg @@ -0,0 +1 @@ +-Werror diff --git a/flang/test/Driver/fcc_override.f90 b/flang/test/Driver/fcc_override.f90 new file mode 100644 index 0000000000000..2717d203c2ea3 --- /dev/null +++ b/flang/test/Driver/fcc_override.f90 @@ -0,0 +1,17 @@ +! RUN: env FCC_OVERRIDE_OPTIONS="#+-Os +-Oz +-O +-O3 +-Oignore +a +b +c xb Xa Omagic ^-### " %flang --target=x86_64-unknown-linux-gnu %s -O2 b -O3 2>&1 | FileCheck %s +! RUN: env FCC_OVERRIDE_OPTIONS="x-Werror +-g" %flang --target=x86_64-unknown-linux-gnu -Werror %s -c -### 2>&1 | FileCheck %s -check-prefix=RM-WERROR +! RUN: env FCC_OVERRIDE_OPTIONS="x-Werror" %flang --config=%S/Inputs/config-7.cfg -### %s -c 2>&1 | FileCheck %s -check-prefix=CONF + +! CHECK: "-fc1" +! CHECK-NOT: "-Oignore" +! CHECK: "-Omagic" +! CHECK-NOT: "-Oignore" + +! RM-WERROR: ### FCC_OVERRIDE_OPTIONS: x-Werror +-g +! RM-WERROR-NEXT: ### Deleting argument -Werror +! RM-WERROR-NEXT: ### Adding argument -g at end +! RM-WERROR-NOT: "-Werror" + +! Test that FCC_OVERRIDE_OPTIONS does not affect the options from config files. +! CONF: ### FCC_OVERRIDE_OPTIONS: x-Werror +! CONF: "-Werror" diff --git a/flang/test/Driver/flang-new-warning.f90 b/flang/test/Driver/flang-new-warning.f90 new file mode 100644 index 0000000000000..3d83c7ad3966c --- /dev/null +++ b/flang/test/Driver/flang-new-warning.f90 @@ -0,0 +1,3 @@ +! RUN: amdflang-new -c %s 2>&1 | FileCheck %s +! 
CHECK: warning: the 'amdflang-new' and 'flang-new' commmands have been deprecated; please use 'amdflang' instead +! XFAIL: * diff --git a/flang/test/Fir/box.fir b/flang/test/Fir/box.fir index 841e10d3debbc..ae63088fd9134 100644 --- a/flang/test/Fir/box.fir +++ b/flang/test/Fir/box.fir @@ -56,14 +56,12 @@ func.func @fa(%a : !fir.ref>) { // CHECK-LABEL: define void @b1( // CHECK-SAME: ptr nocapture %[[res:.*]], ptr nocapture %[[arg0:.*]], i64 %[[arg1:.*]]) func.func @b1(%arg0 : !fir.ref>, %arg1 : index) -> !fir.box> { - // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 } // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]] // CHECK: insertvalue {{.*}} undef, i64 %[[size]], 1 // CHECK: insertvalue {{.*}} i32 20240719, 2 // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0 %x = fir.embox %arg0 typeparams %arg1 : (!fir.ref>, index) -> !fir.box> - // CHECK: store {{.*}}, ptr %[[alloca]] - // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[res]], ptr %[[alloca]], i32 24, i1 false) + // CHECK: store {{.*}}, ptr %[[res]] return %x : !fir.box> } @@ -73,13 +71,11 @@ func.func @b1(%arg0 : !fir.ref>, %arg1 : index) -> !fir.box>>, %arg1 : index) -> !fir.box>> { %1 = fir.shape %arg1 : (index) -> !fir.shape<1> - // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } // CHECK: insertvalue {{.*}} { ptr undef, i64 ptrtoint (ptr getelementptr ([5 x i8], ptr null, i32 1) to i64), i32 20240719, i8 1, i8 40, i8 0, i8 0, {{.*}} }, i64 %[[arg1]], 7, 0, 1 // CHECK: insertvalue {{.*}} %{{.*}}, i64 ptrtoint (ptr getelementptr ([5 x i8], ptr null, i32 1) to i64), 7, 0, 2 // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0 %2 = fir.embox %arg0(%1) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> - // CHECK: store {{.*}}, ptr %[[alloca]] - // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[res]], ptr %[[alloca]], i32 48, i1 false) + // CHECK: store {{.*}}, ptr %[[res]] return %2 : !fir.box>> } @@ -88,7 +84,6 @@ func.func 
@b2(%arg0 : !fir.ref>>, %arg1 : index) -> // CHECK-SAME: ptr nocapture %[[res:.*]], ptr nocapture %[[arg0:.*]], i64 %[[arg1:.*]], i64 %[[arg2:.*]]) func.func @b3(%arg0 : !fir.ref>>, %arg1 : index, %arg2 : index) -> !fir.box>> { %1 = fir.shape %arg2 : (index) -> !fir.shape<1> - // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]] // CHECK: insertvalue {{.*}} i64 %[[size]], 1 // CHECK: insertvalue {{.*}} i32 20240719, 2 @@ -96,8 +91,7 @@ func.func @b3(%arg0 : !fir.ref>>, %arg1 : index, %ar // CHECK: insertvalue {{.*}} i64 %[[size]], 7, 0, 2 // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0 %2 = fir.embox %arg0(%1) typeparams %arg1 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.box>> - // CHECK: store {{.*}}, ptr %[[alloca]] - // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[res]], ptr %[[alloca]], i32 48, i1 false) + // CHECK: store {{.*}}, ptr %[[res]] return %2 : !fir.box>> } @@ -107,7 +101,6 @@ func.func @b3(%arg0 : !fir.ref>>, %arg1 : index, %ar func.func @b4(%arg0 : !fir.ref>>, %arg1 : index) -> !fir.box>> { %c_7 = arith.constant 7 : index %1 = fir.shape %c_7 : (index) -> !fir.shape<1> - // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]] // CHECK: insertvalue {{.*}} i64 %[[size]], 1 // CHECK: insertvalue {{.*}} i32 20240719, 2 @@ -115,8 +108,7 @@ func.func @b4(%arg0 : !fir.ref>>, %arg1 : index) -> // CHECK: insertvalue {{.*}} i64 %[[size]], 7, 0, 2 // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0 %x = fir.embox %arg0(%1) typeparams %arg1 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.box>> - // CHECK: store {{.*}}, ptr %[[alloca]] - // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[res]], ptr %[[alloca]], i32 48, i1 false) + // CHECK: store {{.*}}, ptr %[[res]] return %x : !fir.box>> } @@ -125,7 +117,8 @@ 
func.func @b4(%arg0 : !fir.ref>>, %arg1 : index) -> // CHECK-SAME: ptr nocapture %[[arg0:.*]], ptr %[[arg1:.*]]) func.func @b5(%arg0 : !fir.ref>>>, %arg1 : !fir.box>>) { fir.store %arg1 to %arg0 : !fir.ref>>> - // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %0, ptr %1, i32 72, i1 false) + // CHECK: %[[boxLoad:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[arg1]] + // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } %[[boxLoad]], ptr %[[arg0]] return } diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir index c48a605a1b2a9..22867cae5a7a5 100644 --- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir +++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir @@ -799,8 +799,8 @@ func.func @_QPs(%arg0: !fir.ref> {fir.bindc_name = "x"}) { //CHECK: omp.parallel { //CHECK: %[[CONST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 //CHECK: %[[ALLOCA_1:.*]] = llvm.alloca %[[CONST_1:.*]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr -//CHECK: %[[SIZE:.*]] = llvm.mlir.constant(24 : i32) : i32 -//CHECK: "llvm.intr.memcpy"(%[[ALLOCA_1]], %[[ALLOCA]], %[[SIZE]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () +//CHECK: %[[LOAD:.*]] = llvm.load %[[ALLOCA]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> +//CHECK: llvm.store %[[LOAD]], %[[ALLOCA_1]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr //CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA_1]][0, 0] : (!llvm.ptr) -> !llvm.ptr //CHECK: %[[LOAD_2:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> !llvm.ptr //CHECK: omp.terminator @@ -1160,7 +1160,8 @@ func.func @map_dtype_alloca_mem2(%arg0 : !fir.ref : (!llvm.ptr, !llvm.ptr, i32) -> () + // CHECK: %[[LOAD_9:.*]] = llvm.load %[[ARG_0]] : !llvm.ptr -> [[DESC_TY]] + // CHECK: llvm.store %[[LOAD_9]], %[[DTYPE_ALLOCATABLE_ALOCA]] : [[DESC_TY]], !llvm.ptr %1 = fir.load %arg0 : 
!fir.ref>,array_i:!fir.array<10xi32>,j:f32,array_j:!fir.box>>,k:i32}>>>> // CHECK: %[[GEP_DTYPE_BADDR:.*]] = llvm.getelementptr %[[DTYPE_ALLOCATABLE_ALOCA]][0, 0] : (!llvm.ptr) -> !llvm.ptr, [[DESC_TY]] // CHECK: %[[LOAD_DTYPE_BADDR:.*]] = llvm.load %[[GEP_DTYPE_BADDR]] : !llvm.ptr -> !llvm.ptr @@ -1172,7 +1173,8 @@ func.func @map_dtype_alloca_mem2(%arg0 : !fir.ref>>>, !fir.array) var_ptr_ptr(%3 : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%0) -> !fir.llvm_ptr>> // CHECK: %[[MAP_MEMBER_DESC:.*]] = omp.map.info var_ptr(%[[GEP_DTYPE_MEMBER]] : !llvm.ptr, [[DESC_TY2]]) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr %5 = omp.map.info var_ptr(%2 : !fir.ref>>>, !fir.box>>) map_clauses(tofrom) capture(ByRef) -> !fir.ref>>> - // CHECK: "llvm.intr.memcpy"(%[[DTYPE_ALLOCATABLE_ALOCA_2]], %[[ARG_0]], {{.*}}) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + // CHECK: %[[LOAD_16:.*]] = llvm.load %[[ARG_0]] : !llvm.ptr -> [[DESC_TY]] + // CHECK: llvm.store %[[LOAD_16]], %[[DTYPE_ALLOCATABLE_ALOCA_2]] : [[DESC_TY]], !llvm.ptr %6 = fir.load %arg0 : !fir.ref>,array_i:!fir.array<10xi32>,j:f32,array_j:!fir.box>>,k:i32}>>>> // CHECK: %[[GEP_DTYPE_BADDR:.*]] = llvm.getelementptr %[[DTYPE_ALLOCATABLE_ALOCA_2]][0, 0] : (!llvm.ptr) -> !llvm.ptr, [[DESC_TY]] // CHECK: %[[LOAD_DTYPE_BADDR:.*]] = llvm.load %[[GEP_DTYPE_BADDR]] : !llvm.ptr -> !llvm.ptr @@ -1208,7 +1210,8 @@ func.func @map_nested_dtype_alloca_mem(%arg0 : !fir.ref : (!llvm.ptr, !llvm.ptr, i32) -> () + // CHECK: %[[LOAD_11:.*]] = llvm.load %[[ARG_0]] : !llvm.ptr -> [[DESC_TY]] + // CHECK: llvm.store %[[LOAD_11]], %[[DTYPE_ALLOCATABLE_ALOCA]] : [[DESC_TY]], !llvm.ptr %1 = fir.load %arg0 : !fir.ref>,array_i:!fir.array<10xi32>,j:f32,array_j:!fir.box>>,k:i32,nest:!fir.type<_QFRecTy2{i:f32,array_i:!fir.array<10xi32>,array_k:!fir.box>>,k:i32}>}>>>> // CHECK: %[[GEP_DTYPE_BADDR:.*]] = llvm.getelementptr %[[DTYPE_ALLOCATABLE_ALOCA]][0, 0] : (!llvm.ptr) -> !llvm.ptr, [[DESC_TY]] // CHECK: 
%[[LOAD_GEP_DTYPE_BADDR:.*]] = llvm.load %[[GEP_DTYPE_BADDR]] : !llvm.ptr -> !llvm.ptr @@ -1222,7 +1225,8 @@ func.func @map_nested_dtype_alloca_mem(%arg0 : !fir.ref>>>, !fir.array) var_ptr_ptr(%4 : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%0) -> !fir.llvm_ptr>> // CHECK: %[[MAP_NESTED_MEMBER_DESC:.*]] = omp.map.info var_ptr(%[[GEP_NESTED_DTYPE_ALLOCATABLE_MEMBER]] : !llvm.ptr, [[DESC_TY2]]) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr %6 = omp.map.info var_ptr(%3 : !fir.ref>>>, !fir.box>>) map_clauses(tofrom) capture(ByRef) -> !fir.ref>>> - // CHECK: "llvm.intr.memcpy"(%[[DTYPE_ALLOCATABLE_ALOCA_2]], %[[ARG_0]], {{.*}}) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + // CHECK: %[[LOAD_19:.*]] = llvm.load %[[ARG_0]] : !llvm.ptr -> [[DESC_TY]] + // CHECK: llvm.store %[[LOAD_19]], %[[DTYPE_ALLOCATABLE_ALOCA_2]] : [[DESC_TY]], !llvm.ptr // CHECK: %[[GEP_DTYPE_BADDR:.*]] = llvm.getelementptr %[[DTYPE_ALLOCATABLE_ALOCA_2]][0, 0] : (!llvm.ptr) -> !llvm.ptr, [[DESC_TY]] // CHECK: %[[LOAD_GEP_DTYPE_BADDR:.*]] = llvm.load %[[GEP_DTYPE_BADDR]] : !llvm.ptr -> !llvm.ptr %7 = fir.load %arg0 : !fir.ref>,array_i:!fir.array<10xi32>,j:f32,array_j:!fir.box>>,k:i32,nest:!fir.type<_QFRecTy2{i:f32,array_i:!fir.array<10xi32>,array_k:!fir.box>>,k:i32}>}>>>> diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index 6d7a4a09918e5..6185239266752 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -862,8 +862,8 @@ func.func @test_store_box(%array : !fir.ref>>, %box // CHECK-LABEL: llvm.func @test_store_box // CHECK-SAME: (%[[arg0:.*]]: !llvm.ptr, // CHECK-SAME: %[[arg1:.*]]: !llvm.ptr) { -// CHECK-NEXT: %[[size:.*]] = llvm.mlir.constant(72 : i32) : i32 -// CHECK-NEXT: "llvm.intr.memcpy"(%[[arg0]], %[[arg1]], %[[size]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () +// CHECK-NEXT: %[[box_to_store:.*]] = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<(ptr, i{{.*}}, i{{.*}}, 
i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i{{.*}}>>)> +// CHECK-NEXT: llvm.store %[[box_to_store]], %[[arg0]] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i{{.*}}>>)>, !llvm.ptr // CHECK-NEXT: llvm.return // CHECK-NEXT: } @@ -875,17 +875,15 @@ func.func @store_unlimited_polymorphic_box(%arg0 : !fir.class, %arg1 : !fi fir.store %arg3 to %arg3r : !fir.ref>> return } -// CHECK: llvm.func @store_unlimited_polymorphic_box(%[[VAL_0:.*]]: !llvm.ptr, %[[VAL_1:.*]]: !llvm.ptr, %[[VAL_2:.*]]: !llvm.ptr, %[[VAL_3:.*]]: !llvm.ptr, %[[VAL_4:.*]]: !llvm.ptr, %[[VAL_5:.*]]: !llvm.ptr, %[[VAL_6:.*]]: !llvm.ptr, %[[VAL_7:.*]]: !llvm.ptr) { -// CHECK: %[[VAL_8:.*]] = llvm.mlir.constant(40 : i32) : i32 -// CHECK: "llvm.intr.memcpy"(%[[VAL_4]], %[[VAL_0]], %[[VAL_8]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () -// CHECK: %[[VAL_9:.*]] = llvm.mlir.constant(64 : i32) : i32 -// CHECK: "llvm.intr.memcpy"(%[[VAL_5]], %[[VAL_1]], %[[VAL_9]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () -// CHECK: %[[VAL_10:.*]] = llvm.mlir.constant(40 : i32) : i32 -// CHECK: "llvm.intr.memcpy"(%[[VAL_6]], %[[VAL_2]], %[[VAL_10]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () -// CHECK: %[[VAL_11:.*]] = llvm.mlir.constant(64 : i32) : i32 -// CHECK: "llvm.intr.memcpy"(%[[VAL_7]], %[[VAL_3]], %[[VAL_11]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () -// CHECK: llvm.return -// CHECK: } +// CHECK-LABEL: llvm.func @store_unlimited_polymorphic_box( +// CHECK: %[[VAL_8:.*]] = llvm.load %{{.*}} : !llvm.ptr -> !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)> +// CHECK: llvm.store %[[VAL_8]], %{{.*}} : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>, !llvm.ptr +// CHECK: %[[VAL_9:.*]] = llvm.load %{{.*}} : !llvm.ptr -> !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x 
array<3 x i{{.*}}>>, ptr, array<1 x i{{.*}}>)> +// CHECK: llvm.store %[[VAL_9]], %{{.*}} : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i{{.*}}>>, ptr, array<1 x i{{.*}}>)>, !llvm.ptr +// CHECK: %[[VAL_10:.*]] = llvm.load %{{.*}} : !llvm.ptr -> !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)> +// CHECK: llvm.store %[[VAL_10]], %{{.*}} : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>, !llvm.ptr +// CHECK: %[[VAL_11:.*]] = llvm.load %{{.*}}: !llvm.ptr +// CHECK: llvm.store %[[VAL_11]], %{{.*}} : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i{{.*}}>>, ptr, array<1 x i{{.*}}>)>, !llvm.ptr // ----- @@ -937,8 +935,8 @@ func.func @test_load_box(%addr : !fir.ref>>) { // GENERIC-NEXT: %[[box_copy:.*]] = llvm.alloca %[[c1]] x !llvm.struct<([[DESC_TYPE:.*]])> // AMDGPU-NEXT: %[[alloca_box_copy:.*]] = llvm.alloca %[[c1]] x !llvm.struct<([[DESC_TYPE:.*]])>{{.*}} : (i32) -> !llvm.ptr<5> // AMDGPU-NEXT: %[[box_copy:.*]] = llvm.addrspacecast %[[alloca_box_copy]] : !llvm.ptr<5> to !llvm.ptr -// CHECK-NEXT: %[[size:.*]] = llvm.mlir.constant(48 : i32) : i32 -// CHECK-NEXT: "llvm.intr.memcpy"(%[[box_copy]], %[[arg0]], %[[size]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () +// CHECK-NEXT: %[[box_val:.*]] = llvm.load %[[arg0]] : !llvm.ptr -> !llvm.struct<([[DESC_TYPE]])> +// CHECK-NEXT: llvm.store %[[box_val]], %[[box_copy]] : !llvm.struct<([[DESC_TYPE]])>, !llvm.ptr // CHECK-NEXT: llvm.call @takes_box(%[[box_copy]]) : (!llvm.ptr) -> () // CHECK-NEXT: llvm.return // CHECK-NEXT: } diff --git a/flang/test/Fir/embox-char.fir b/flang/test/Fir/embox-char.fir index efb069f96520d..bf8344dbb60fc 100644 --- a/flang/test/Fir/embox-char.fir +++ b/flang/test/Fir/embox-char.fir @@ -1,10 +1,3 @@ -// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py - -// The script is 
designed to make adding checks to -// a test case fast, it is *not* designed to be authoritative -// about what constitutes a good test! The CHECK should be -// minimized and named to reflect the test intent. - // Test that the offset of the first element of the slice // is computed in elements of the type used for the GEP // computing the base of the slice. @@ -17,40 +10,42 @@ // print *, x(2,:) // end subroutine -// CHECK: llvm.func @test_char4(%[[VAL_0:.*]]: !llvm.ptr, %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64, %[[VAL_5:.*]]: i64, %[[VAL_6:.*]]: i64) { +// CHECK-LABEL: llvm.func @test_char4( +// CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr, +// CHECK-SAME: %[[VAL_1_SLICE_LB0:.*]]: i64, %[[VAL_2_SLICE_EX0:.*]]: i64, %[[VAL_3_SLICE_ST0:.*]]: i64, %[[VAL_4_SLICE_LB1:.*]]: i64, %[[VAL_5_SLICE_EX1:.*]]: i64, %[[VAL_6_SLICE_ST1:.*]]: i64) { // CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[VAL_8:.*]] = llvm.alloca %[[VAL_7]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr // CHECK: %[[VAL_9:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[VAL_10:.*]] = llvm.alloca %[[VAL_9]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr // CHECK: %[[VAL_11:.*]] = llvm.mlir.constant(0 : index) : i64 // CHECK: %[[VAL_12:.*]] = llvm.mlir.constant(1 : index) : i64 -// CHECK: %[[VAL_13:.*]] = llvm.mlir.constant(4 : index) : i64 -// CHECK: %[[VAL_14:.*]] = llvm.mlir.constant(72 : i32) : i32 -// CHECK: "llvm.intr.memcpy"(%[[VAL_10]], %[[VAL_0]], %[[VAL_14]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () +// CHECK: %[[VAL_13_WIDTH:.*]] = llvm.mlir.constant(4 : index) : i64 +// CHECK: %[[VAL_14:.*]] = llvm.load %[[VAL_0]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: llvm.store %[[VAL_14]], %[[VAL_10]] : !llvm.struct<(ptr, i64, i32, 
i8, i8, i8, i8, array<2 x array<3 x i64>>)>, !llvm.ptr // CHECK: %[[VAL_15:.*]] = llvm.getelementptr %[[VAL_10]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_16:.*]] = llvm.load %[[VAL_15]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_16_BYTESIZE:.*]] = llvm.load %[[VAL_15]] : !llvm.ptr -> i64 // CHECK: %[[VAL_17:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_18:.*]] = llvm.load %[[VAL_17]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_18_LB1:.*]] = llvm.load %[[VAL_17]] : !llvm.ptr -> i64 // CHECK: %[[VAL_19:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_20:.*]] = llvm.load %[[VAL_19]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_20_EX1:.*]] = llvm.load %[[VAL_19]] : !llvm.ptr -> i64 // CHECK: %[[VAL_21:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_22:.*]] = llvm.load %[[VAL_21]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_22_ST1:.*]] = llvm.load %[[VAL_21]] : !llvm.ptr -> i64 // CHECK: %[[VAL_23:.*]] = llvm.getelementptr %[[VAL_10]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_24:.*]] = llvm.load %[[VAL_23]] : !llvm.ptr -> !llvm.ptr +// CHECK: %[[VAL_24_BASEPTR:.*]] = llvm.load %[[VAL_23]] : !llvm.ptr -> !llvm.ptr // CHECK: %[[VAL_25:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_26:.*]] = llvm.load %[[VAL_25]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_26_LB0:.*]] = llvm.load %[[VAL_25]] : !llvm.ptr -> 
i64 // CHECK: %[[VAL_27:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_28:.*]] = llvm.load %[[VAL_27]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_28_EX0:.*]] = llvm.load %[[VAL_27]] : !llvm.ptr -> i64 // CHECK: %[[VAL_29:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_30:.*]] = llvm.load %[[VAL_29]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_31:.*]] = llvm.sdiv %[[VAL_16]], %[[VAL_13]] : i64 +// CHECK: %[[VAL_30_ST0:.*]] = llvm.load %[[VAL_29]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_31_LEN:.*]] = llvm.sdiv %[[VAL_16_BYTESIZE]], %[[VAL_13_WIDTH]] : i64 // CHECK: %[[VAL_32:.*]] = llvm.mlir.constant(44 : i32) : i32 // CHECK: %[[VAL_33:.*]] = llvm.mlir.zero : !llvm.ptr // CHECK: %[[VAL_34:.*]] = llvm.getelementptr %[[VAL_33]][1] : (!llvm.ptr) -> !llvm.ptr, i32 // CHECK: %[[VAL_35:.*]] = llvm.ptrtoint %[[VAL_34]] : !llvm.ptr to i64 -// CHECK: %[[VAL_36:.*]] = llvm.mul %[[VAL_35]], %[[VAL_31]] : i64 +// CHECK: %[[VAL_36_BYTESIZE:.*]] = llvm.mul %[[VAL_35]], %[[VAL_31_LEN]] : i64 // CHECK: %[[VAL_37:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_38:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_37]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_38:.*]] = llvm.insertvalue %[[VAL_36_BYTESIZE]], %[[VAL_37]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> // CHECK: %[[VAL_39:.*]] = llvm.mlir.constant(20240719 : i32) : i32 // CHECK: %[[VAL_40:.*]] = llvm.insertvalue %[[VAL_39]], %[[VAL_38]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> // CHECK: %[[VAL_41:.*]] = llvm.mlir.constant(2 : i32) : i32 @@ -64,39 +59,39 @@ // CHECK: %[[VAL_49:.*]] = 
llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_50:.*]] = llvm.trunc %[[VAL_49]] : i32 to i8 // CHECK: %[[VAL_51:.*]] = llvm.insertvalue %[[VAL_50]], %[[VAL_48]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_52:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[VAL_52_c0:.*]] = llvm.mlir.constant(0 : i64) : i64 // CHECK: %[[VAL_53:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_54:.*]] = llvm.sub %[[VAL_1]], %[[VAL_26]] : i64 -// CHECK: %[[VAL_55:.*]] = llvm.mul %[[VAL_54]], %[[VAL_31]] : i64 -// CHECK: %[[VAL_56:.*]] = llvm.add %[[VAL_55]], %[[VAL_52]] : i64 -// CHECK: %[[VAL_57:.*]] = llvm.sub %[[VAL_2]], %[[VAL_1]] : i64 -// CHECK: %[[VAL_58:.*]] = llvm.add %[[VAL_57]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_59:.*]] = llvm.sdiv %[[VAL_58]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_60:.*]] = llvm.icmp "sgt" %[[VAL_59]], %[[VAL_52]] : i64 -// CHECK: %[[VAL_61:.*]] = llvm.select %[[VAL_60]], %[[VAL_59]], %[[VAL_52]] : i1, i64 +// CHECK: %[[VAL_54:.*]] = llvm.sub %[[VAL_1_SLICE_LB0]], %[[VAL_26_LB0]] : i64 +// CHECK: %[[VAL_55:.*]] = llvm.mul %[[VAL_54]], %[[VAL_31_LEN]] : i64 +// CHECK: %[[VAL_56_SLICE_OFF0:.*]] = llvm.add %[[VAL_55]], %[[VAL_52_c0]] : i64 +// CHECK: %[[VAL_57:.*]] = llvm.sub %[[VAL_2_SLICE_EX0]], %[[VAL_1_SLICE_LB0]] : i64 +// CHECK: %[[VAL_58:.*]] = llvm.add %[[VAL_57]], %[[VAL_3_SLICE_ST0]] : i64 +// CHECK: %[[VAL_59:.*]] = llvm.sdiv %[[VAL_58]], %[[VAL_3_SLICE_ST0]] : i64 +// CHECK: %[[VAL_60:.*]] = llvm.icmp "sgt" %[[VAL_59]], %[[VAL_52_c0]] : i64 +// CHECK: %[[VAL_61:.*]] = llvm.select %[[VAL_60]], %[[VAL_59]], %[[VAL_52_c0]] : i1, i64 // CHECK: %[[VAL_62:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_51]][7, 0, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> // CHECK: %[[VAL_63:.*]] = llvm.insertvalue %[[VAL_61]], %[[VAL_62]][7, 0, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_64:.*]] = llvm.mul %[[VAL_36]], 
%[[VAL_3]] : i64 +// CHECK: %[[VAL_64:.*]] = llvm.mul %[[VAL_36_BYTESIZE]], %[[VAL_3_SLICE_ST0]] : i64 // CHECK: %[[VAL_65:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_63]][7, 0, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_66:.*]] = llvm.mul %[[VAL_36]], %[[VAL_28]] : i64 -// CHECK: %[[VAL_67:.*]] = llvm.mul %[[VAL_31]], %[[VAL_28]] : i64 -// CHECK: %[[VAL_68:.*]] = llvm.sub %[[VAL_4]], %[[VAL_18]] : i64 -// CHECK: %[[VAL_69:.*]] = llvm.mul %[[VAL_68]], %[[VAL_67]] : i64 -// CHECK: %[[VAL_70:.*]] = llvm.add %[[VAL_69]], %[[VAL_56]] : i64 -// CHECK: %[[VAL_71:.*]] = llvm.sub %[[VAL_5]], %[[VAL_4]] : i64 -// CHECK: %[[VAL_72:.*]] = llvm.add %[[VAL_71]], %[[VAL_6]] : i64 -// CHECK: %[[VAL_73:.*]] = llvm.sdiv %[[VAL_72]], %[[VAL_6]] : i64 -// CHECK: %[[VAL_74:.*]] = llvm.icmp "sgt" %[[VAL_73]], %[[VAL_52]] : i64 -// CHECK: %[[VAL_75:.*]] = llvm.select %[[VAL_74]], %[[VAL_73]], %[[VAL_52]] : i1, i64 +// CHECK: %[[VAL_66:.*]] = llvm.mul %[[VAL_36_BYTESIZE]], %[[VAL_28_EX0]] : i64 +// CHECK: %[[VAL_67:.*]] = llvm.mul %[[VAL_31_LEN]], %[[VAL_28_EX0]] : i64 +// CHECK: %[[VAL_68:.*]] = llvm.sub %[[VAL_4_SLICE_LB1]], %[[VAL_18_LB1]] : i64 +// CHECK: %[[VAL_69_SLICE_OFF1:.*]] = llvm.mul %[[VAL_68]], %[[VAL_67]] : i64 +// CHECK: %[[VAL_70_OFFSET:.*]] = llvm.add %[[VAL_69_SLICE_OFF1]], %[[VAL_56_SLICE_OFF0]] : i64 +// CHECK: %[[VAL_71:.*]] = llvm.sub %[[VAL_5_SLICE_EX1]], %[[VAL_4_SLICE_LB1]] : i64 +// CHECK: %[[VAL_72:.*]] = llvm.add %[[VAL_71]], %[[VAL_6_SLICE_ST1]] : i64 +// CHECK: %[[VAL_73:.*]] = llvm.sdiv %[[VAL_72]], %[[VAL_6_SLICE_ST1]] : i64 +// CHECK: %[[VAL_74:.*]] = llvm.icmp "sgt" %[[VAL_73]], %[[VAL_52_c0]] : i64 +// CHECK: %[[VAL_75:.*]] = llvm.select %[[VAL_74]], %[[VAL_73]], %[[VAL_52_c0]] : i1, i64 // CHECK: %[[VAL_76:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_65]][7, 1, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> // CHECK: %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_75]], 
%[[VAL_76]][7, 1, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_78:.*]] = llvm.mul %[[VAL_66]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_78:.*]] = llvm.mul %[[VAL_66]], %[[VAL_6_SLICE_ST1]] : i64 // CHECK: %[[VAL_79:.*]] = llvm.insertvalue %[[VAL_78]], %[[VAL_77]][7, 1, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_80:.*]] = llvm.mul %[[VAL_66]], %[[VAL_20]] : i64 -// CHECK: %[[VAL_81:.*]] = llvm.mul %[[VAL_67]], %[[VAL_20]] : i64 -// CHECK: %[[VAL_82:.*]] = llvm.getelementptr %[[VAL_24]]{{\[}}%[[VAL_70]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32 -// CHECK: %[[VAL_83:.*]] = llvm.insertvalue %[[VAL_82]], %[[VAL_79]][0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: llvm.store %[[VAL_83]], %[[VAL_8]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>, !llvm.ptr +// CHECK: %[[VAL_80:.*]] = llvm.mul %[[VAL_66]], %[[VAL_20_EX1]] : i64 +// CHECK: %[[VAL_81:.*]] = llvm.mul %[[VAL_67]], %[[VAL_20_EX1]] : i64 +// CHECK: %[[VAL_82:.*]] = llvm.getelementptr %[[VAL_24_BASEPTR]]{{\[}}%[[VAL_70_OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32 +// CHECK: %[[VAL_84:.*]] = llvm.insertvalue %[[VAL_82]], %[[VAL_79]][0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: llvm.store %[[VAL_84]], %[[VAL_8]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>, !llvm.ptr // CHECK: llvm.return // CHECK: } func.func @test_char4(%arg0: !fir.ref>>>>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index) { @@ -113,84 +108,86 @@ func.func @test_char4(%arg0: !fir.ref>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr // CHECK: %[[VAL_9:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[VAL_10:.*]] = llvm.alloca %[[VAL_9]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr // CHECK: 
%[[VAL_11:.*]] = llvm.mlir.constant(0 : index) : i64 -// CHECK: %[[VAL_12:.*]] = llvm.mlir.constant(1 : index) : i64 -// CHECK: %[[VAL_13:.*]] = llvm.mlir.constant(72 : i32) : i32 -// CHECK: "llvm.intr.memcpy"(%[[VAL_10]], %[[VAL_0]], %[[VAL_13]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () -// CHECK: %[[VAL_14:.*]] = llvm.getelementptr %[[VAL_10]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_15:.*]] = llvm.load %[[VAL_14]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_16:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_17:.*]] = llvm.load %[[VAL_16]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_18:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_19:.*]] = llvm.load %[[VAL_18]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_20:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_21:.*]] = llvm.load %[[VAL_20]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_22:.*]] = llvm.getelementptr %[[VAL_10]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_23:.*]] = llvm.load %[[VAL_22]] : !llvm.ptr -> !llvm.ptr -// CHECK: %[[VAL_24:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_25:.*]] = llvm.load %[[VAL_24]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_26:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// 
CHECK: %[[VAL_27:.*]] = llvm.load %[[VAL_26]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_28:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_29:.*]] = llvm.load %[[VAL_28]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_30:.*]] = llvm.mlir.constant(40 : i32) : i32 -// CHECK: %[[VAL_31:.*]] = llvm.mlir.zero : !llvm.ptr -// CHECK: %[[VAL_32:.*]] = llvm.getelementptr %[[VAL_31]][1] : (!llvm.ptr) -> !llvm.ptr, i8 -// CHECK: %[[VAL_33:.*]] = llvm.ptrtoint %[[VAL_32]] : !llvm.ptr to i64 -// CHECK: %[[VAL_34:.*]] = llvm.mul %[[VAL_33]], %[[VAL_15]] : i64 -// CHECK: %[[VAL_35:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_36:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_35]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_37:.*]] = llvm.mlir.constant(20240719 : i32) : i32 -// CHECK: %[[VAL_38:.*]] = llvm.insertvalue %[[VAL_37]], %[[VAL_36]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_39:.*]] = llvm.mlir.constant(2 : i32) : i32 -// CHECK: %[[VAL_40:.*]] = llvm.trunc %[[VAL_39]] : i32 to i8 -// CHECK: %[[VAL_41:.*]] = llvm.insertvalue %[[VAL_40]], %[[VAL_38]][3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_42:.*]] = llvm.trunc %[[VAL_30]] : i32 to i8 -// CHECK: %[[VAL_43:.*]] = llvm.insertvalue %[[VAL_42]], %[[VAL_41]][4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_44:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_45:.*]] = llvm.trunc %[[VAL_44]] : i32 to i8 -// CHECK: %[[VAL_46:.*]] = llvm.insertvalue %[[VAL_45]], %[[VAL_43]][5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_47:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: 
%[[VAL_48:.*]] = llvm.trunc %[[VAL_47]] : i32 to i8 -// CHECK: %[[VAL_49:.*]] = llvm.insertvalue %[[VAL_48]], %[[VAL_46]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_50:.*]] = llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[VAL_51:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_52:.*]] = llvm.sub %[[VAL_1]], %[[VAL_25]] : i64 -// CHECK: %[[VAL_53:.*]] = llvm.mul %[[VAL_52]], %[[VAL_15]] : i64 -// CHECK: %[[VAL_54:.*]] = llvm.add %[[VAL_53]], %[[VAL_50]] : i64 -// CHECK: %[[VAL_55:.*]] = llvm.sub %[[VAL_2]], %[[VAL_1]] : i64 -// CHECK: %[[VAL_56:.*]] = llvm.add %[[VAL_55]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_57:.*]] = llvm.sdiv %[[VAL_56]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_58:.*]] = llvm.icmp "sgt" %[[VAL_57]], %[[VAL_50]] : i64 -// CHECK: %[[VAL_59:.*]] = llvm.select %[[VAL_58]], %[[VAL_57]], %[[VAL_50]] : i1, i64 -// CHECK: %[[VAL_60:.*]] = llvm.insertvalue %[[VAL_51]], %[[VAL_49]][7, 0, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_61:.*]] = llvm.insertvalue %[[VAL_59]], %[[VAL_60]][7, 0, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_62:.*]] = llvm.mul %[[VAL_34]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_63:.*]] = llvm.insertvalue %[[VAL_62]], %[[VAL_61]][7, 0, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_64:.*]] = llvm.mul %[[VAL_34]], %[[VAL_27]] : i64 -// CHECK: %[[VAL_65:.*]] = llvm.mul %[[VAL_15]], %[[VAL_27]] : i64 -// CHECK: %[[VAL_66:.*]] = llvm.sub %[[VAL_4]], %[[VAL_17]] : i64 -// CHECK: %[[VAL_67:.*]] = llvm.mul %[[VAL_66]], %[[VAL_65]] : i64 -// CHECK: %[[VAL_68:.*]] = llvm.add %[[VAL_67]], %[[VAL_54]] : i64 -// CHECK: %[[VAL_69:.*]] = llvm.sub %[[VAL_5]], %[[VAL_4]] : i64 -// CHECK: %[[VAL_70:.*]] = llvm.add %[[VAL_69]], %[[VAL_6]] : i64 -// CHECK: %[[VAL_71:.*]] = llvm.sdiv %[[VAL_70]], %[[VAL_6]] : i64 -// CHECK: %[[VAL_72:.*]] = llvm.icmp 
"sgt" %[[VAL_71]], %[[VAL_50]] : i64 -// CHECK: %[[VAL_73:.*]] = llvm.select %[[VAL_72]], %[[VAL_71]], %[[VAL_50]] : i1, i64 -// CHECK: %[[VAL_74:.*]] = llvm.insertvalue %[[VAL_51]], %[[VAL_63]][7, 1, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_75:.*]] = llvm.insertvalue %[[VAL_73]], %[[VAL_74]][7, 1, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_76:.*]] = llvm.mul %[[VAL_64]], %[[VAL_6]] : i64 -// CHECK: %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_76]], %[[VAL_75]][7, 1, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_78:.*]] = llvm.mul %[[VAL_64]], %[[VAL_19]] : i64 -// CHECK: %[[VAL_79:.*]] = llvm.mul %[[VAL_65]], %[[VAL_19]] : i64 -// CHECK: %[[VAL_80:.*]] = llvm.getelementptr %[[VAL_23]]{{\[}}%[[VAL_68]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 -// CHECK: %[[VAL_81:.*]] = llvm.insertvalue %[[VAL_80]], %[[VAL_77]][0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: llvm.store %[[VAL_81]], %[[VAL_8]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>, !llvm.ptr +// CHECK: %[[VAL_12_c1:.*]] = llvm.mlir.constant(1 : index) : i64 +// CHECK: %[[VAL_14:.*]] = llvm.load %[[VAL_0]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: llvm.store %[[VAL_14]], %[[VAL_10]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>, !llvm.ptr +// CHECK: %[[VAL_15:.*]] = llvm.getelementptr %[[VAL_10]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_16_BYTESIZE:.*]] = llvm.load %[[VAL_15]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_17:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_18_LB1:.*]] = llvm.load 
%[[VAL_17]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_19:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_20_EX1:.*]] = llvm.load %[[VAL_19]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_21:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_22_ST1:.*]] = llvm.load %[[VAL_21]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_23:.*]] = llvm.getelementptr %[[VAL_10]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_24_BASEPTR:.*]] = llvm.load %[[VAL_23]] : !llvm.ptr -> !llvm.ptr +// CHECK: %[[VAL_25:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_26_LB0:.*]] = llvm.load %[[VAL_25]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_27:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_28_EX0:.*]] = llvm.load %[[VAL_27]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_29:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_30_ST0:.*]] = llvm.load %[[VAL_29]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_32:.*]] = llvm.mlir.constant(40 : i32) : i32 +// CHECK: %[[VAL_33:.*]] = llvm.mlir.zero : !llvm.ptr +// CHECK: %[[VAL_34:.*]] = llvm.getelementptr %[[VAL_33]][1] : (!llvm.ptr) -> !llvm.ptr, i8 +// CHECK: %[[VAL_35:.*]] = llvm.ptrtoint %[[VAL_34]] : !llvm.ptr to i64 +// CHECK: %[[VAL_36_BYTESIZE:.*]] = llvm.mul %[[VAL_35]], %[[VAL_16_BYTESIZE]] : i64 +// CHECK: %[[VAL_37:.*]] = llvm.mlir.undef : 
!llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_38:.*]] = llvm.insertvalue %[[VAL_36_BYTESIZE]], %[[VAL_37]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_39:.*]] = llvm.mlir.constant(20240719 : i32) : i32 +// CHECK: %[[VAL_40:.*]] = llvm.insertvalue %[[VAL_39]], %[[VAL_38]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_41:.*]] = llvm.mlir.constant(2 : i32) : i32 +// CHECK: %[[VAL_42:.*]] = llvm.trunc %[[VAL_41]] : i32 to i8 +// CHECK: %[[VAL_43:.*]] = llvm.insertvalue %[[VAL_42]], %[[VAL_40]][3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_44:.*]] = llvm.trunc %[[VAL_32]] : i32 to i8 +// CHECK: %[[VAL_45:.*]] = llvm.insertvalue %[[VAL_44]], %[[VAL_43]][4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_46:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_47:.*]] = llvm.trunc %[[VAL_46]] : i32 to i8 +// CHECK: %[[VAL_48:.*]] = llvm.insertvalue %[[VAL_47]], %[[VAL_45]][5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_49:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_50:.*]] = llvm.trunc %[[VAL_49]] : i32 to i8 +// CHECK: %[[VAL_51:.*]] = llvm.insertvalue %[[VAL_50]], %[[VAL_48]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_52_c0:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[VAL_53:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_54:.*]] = llvm.sub %[[VAL_1_SLICE_LB0]], %[[VAL_26_LB0]] : i64 +// CHECK: %[[VAL_55:.*]] = llvm.mul %[[VAL_54]], %[[VAL_16_BYTESIZE]] : i64 +// CHECK: %[[VAL_56_SLICE_OFF0:.*]] = llvm.add %[[VAL_55]], %[[VAL_52_c0]] : i64 +// CHECK: %[[VAL_57:.*]] = llvm.sub %[[VAL_2_SLICE_EX0]], %[[VAL_1_SLICE_LB0]] : i64 +// CHECK: %[[VAL_58:.*]] = llvm.add %[[VAL_57]], 
%[[VAL_3_SLICE_ST0]] : i64 +// CHECK: %[[VAL_59:.*]] = llvm.sdiv %[[VAL_58]], %[[VAL_3_SLICE_ST0]] : i64 +// CHECK: %[[VAL_60:.*]] = llvm.icmp "sgt" %[[VAL_59]], %[[VAL_52_c0]] : i64 +// CHECK: %[[VAL_61:.*]] = llvm.select %[[VAL_60]], %[[VAL_59]], %[[VAL_52_c0]] : i1, i64 +// CHECK: %[[VAL_62:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_51]][7, 0, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_63:.*]] = llvm.insertvalue %[[VAL_61]], %[[VAL_62]][7, 0, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_64:.*]] = llvm.mul %[[VAL_36_BYTESIZE]], %[[VAL_3_SLICE_ST0]] : i64 +// CHECK: %[[VAL_65:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_63]][7, 0, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_66:.*]] = llvm.mul %[[VAL_36_BYTESIZE]], %[[VAL_28_EX0]] : i64 +// CHECK: %[[VAL_67:.*]] = llvm.mul %[[VAL_16_BYTESIZE]], %[[VAL_28_EX0]] : i64 +// CHECK: %[[VAL_68:.*]] = llvm.sub %[[VAL_4_SLICE_LB1]], %[[VAL_18_LB1]] : i64 +// CHECK: %[[VAL_69_SLICE_OFF1:.*]] = llvm.mul %[[VAL_68]], %[[VAL_67]] : i64 +// CHECK: %[[VAL_70_OFFSET:.*]] = llvm.add %[[VAL_69_SLICE_OFF1]], %[[VAL_56_SLICE_OFF0]] : i64 +// CHECK: %[[VAL_71:.*]] = llvm.sub %[[VAL_5_SLICE_EX1]], %[[VAL_4_SLICE_LB1]] : i64 +// CHECK: %[[VAL_72:.*]] = llvm.add %[[VAL_71]], %[[VAL_6_SLICE_ST1]] : i64 +// CHECK: %[[VAL_73:.*]] = llvm.sdiv %[[VAL_72]], %[[VAL_6_SLICE_ST1]] : i64 +// CHECK: %[[VAL_74:.*]] = llvm.icmp "sgt" %[[VAL_73]], %[[VAL_52_c0]] : i64 +// CHECK: %[[VAL_75:.*]] = llvm.select %[[VAL_74]], %[[VAL_73]], %[[VAL_52_c0]] : i1, i64 +// CHECK: %[[VAL_76:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_65]][7, 1, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_75]], %[[VAL_76]][7, 1, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_78:.*]] = llvm.mul 
%[[VAL_66]], %[[VAL_6_SLICE_ST1]] : i64 +// CHECK: %[[VAL_79:.*]] = llvm.insertvalue %[[VAL_78]], %[[VAL_77]][7, 1, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_80:.*]] = llvm.mul %[[VAL_66]], %[[VAL_20_EX1]] : i64 +// CHECK: %[[VAL_81:.*]] = llvm.mul %[[VAL_67]], %[[VAL_20_EX1]] : i64 +// CHECK: %[[VAL_82:.*]] = llvm.getelementptr %[[VAL_24_BASEPTR]]{{\[}}%[[VAL_70_OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 +// CHECK: %[[VAL_84:.*]] = llvm.insertvalue %[[VAL_82]], %[[VAL_79]][0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: llvm.store %[[VAL_84]], %[[VAL_8]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>, !llvm.ptr // CHECK: llvm.return // CHECK: } func.func @test_char1(%arg0: !fir.ref>>>>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index) { diff --git a/flang/test/Fir/polymorphic.fir b/flang/test/Fir/polymorphic.fir index f9cf6fab6b707..cb40bbd44494a 100644 --- a/flang/test/Fir/polymorphic.fir +++ b/flang/test/Fir/polymorphic.fir @@ -14,7 +14,8 @@ func.func @_QMpolymorphic_testPtest_allocate_unlimited_polymorphic_non_derived() // CHECK: %[[MEM:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } // CHECK: %[[DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 0, i32 20240719, i8 0, i8 -1, i8 1, i8 1, ptr null, [1 x i64] zeroinitializer }, ptr %[[MEM]] -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[DESC]], ptr %[[MEM]], i32 40, i1 false) +// CHECK: %[[LOADED:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[MEM]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOADED]], ptr %[[DESC]] // CHECK: ret void // CHECK: } @@ -65,7 +66,8 @@ func.func @_QMpolymorphic_testPtest_embox() { // CHECK-LABEL: @_QMpolymorphic_testPtest_embox() // CHECK: 
%[[ALLOCA_DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } { ptr @_QFEy, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, {{.*}}, ptr %[[ALLOCA_DESC]] -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr @_QFEx, ptr %[[ALLOCA_DESC]], i32 64, i1 false) +// CHECK: %[[LOADED_DESC:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ALLOCA_DESC]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[LOADED_DESC]], ptr @_QFEx, align 8 // Test emboxing of an array element from an unlimited polymorphic array. @@ -156,7 +158,8 @@ func.func @_QQmain() { // CHECK: %[[CLASS_NONE:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } // CHECK: %[[DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr @_QMmod1Ea, i64 ptrtoint (ptr getelementptr (%_QMmod1TtK2, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 1, i8 1, ptr @_QMmod1EXdtXtX2, [1 x i64] zeroinitializer }, ptr %[[CLASS_NONE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[DESC]], ptr %[[CLASS_NONE]], i32 40, i1 false) +// CHECK: %[[LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[CLASS_NONE]] +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD]], ptr %[[DESC]] // CHECK: call void @_QMmod1Psub1(ptr %[[DESC]]) fir.global @_QMmod2Ep : !fir.class> { @@ -177,7 +180,8 @@ func.func private @_FortranAPointerAssociate(!fir.ref>, !fir.box< // CHECK-LABEL: define void @_QMmod2Pinitp( // CHECK-SAME: ptr nocapture %[[ARG0:.*]]){{.*}}{ // CHECK: %[[ALLOCA_CLASS_NONE:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[ALLOCA_CLASS_NONE]], ptr %[[ARG0]], i32 40, i1 false) +// 
CHECK: %[[LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[ARG0]] +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD]], ptr %[[ALLOCA_CLASS_NONE]] // CHECK: call void @_FortranAPointerAssociate(ptr @_QMmod2Ep, ptr %[[ALLOCA_CLASS_NONE]]) // CHECK: ret void diff --git a/flang/test/Fir/tbaa.fir b/flang/test/Fir/tbaa.fir index 401ebbc8c49fe..809ab3a922a0f 100644 --- a/flang/test/Fir/tbaa.fir +++ b/flang/test/Fir/tbaa.fir @@ -137,8 +137,8 @@ module { // CHECK: %[[VAL_7:.*]] = llvm.mlir.addressof @_QFEx : !llvm.ptr // CHECK: %[[VAL_8:.*]] = llvm.mlir.addressof @_QQclX2E2F64756D6D792E66393000 : !llvm.ptr // CHECK: %[[VAL_10:.*]] = llvm.call @_FortranAioBeginExternalListOutput(%[[VAL_6]], %[[VAL_8]], %[[VAL_5]]) {fastmathFlags = #llvm.fastmath} : (i32, !llvm.ptr, i32) -> !llvm.ptr -// CHECK: %[[VAL_11:.*]] = llvm.mlir.constant(64 : i32) : i32 -// CHECK: "llvm.intr.memcpy"(%[[VAL_3]], %[[VAL_7]], %[[VAL_11]]) <{isVolatile = false, tbaa = [#[[$BOXT]]]}> +// CHECK: %[[VAL_11:.*]] = llvm.load %[[VAL_7]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> +// CHECK: llvm.store %[[VAL_11]], %[[VAL_3]] {tbaa = [#[[$BOXT]]]} : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)>, !llvm.ptr // CHECK: %[[VAL_12:.*]] = llvm.getelementptr %[[VAL_3]][0, 7, %[[VAL_4]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> // CHECK: %[[VAL_13:.*]] = llvm.load %[[VAL_12]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i64 // CHECK: %[[VAL_14:.*]] = llvm.getelementptr %[[VAL_3]][0, 7, %[[VAL_4]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> diff --git a/flang/test/Integration/OpenMP/private-global.f90 b/flang/test/Integration/OpenMP/private-global.f90 index 
07dbe86e5ec93..28ba4c9b260b9 100644 --- a/flang/test/Integration/OpenMP/private-global.f90 +++ b/flang/test/Integration/OpenMP/private-global.f90 @@ -1,5 +1,5 @@ !RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s - +!XFAIL: * ! Regression test for https://github.com/llvm/llvm-project/issues/106297 program bug @@ -29,12 +29,17 @@ program bug ! CHECK: %[[TABLE_BOX_ADDR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 ! CHECK: %[[BOXED_FIFTY:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 ! CHECK: %[[TABLE_BOX_ADDR2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 -! CHECK: %[[TABLE_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] {{\[\[}}3 x i64] [i64 1, i64 10, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64)]] }, ptr %[[PRIV_TABLE]], 0 -! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL]], ptr %[[TABLE_BOX_ADDR]], align 8 -! CHECK : %[[TABLE_BOX_VAL2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[TABLE_BOX_ADDR]], align 8 -! CHECK : store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL2]], ptr %[[TABLE_BOX_ADDR2]], align 8 -! CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[TABLE_BOX_ADDR2]], ptr %[[TABLE_BOX_ADDR]], i32 48, i1 false) +! CHECK: %[[LOAD_26:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[PRIV_BOX_ALLOC]], align 8 +! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[LOAD_26]], ptr %[[INTERMEDIATE]], align 8 +! CHECK: store i32 50, ptr %[[FIFTY]], align 4 +! CHECK: %[[FIFTY_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 9, i8 0, i8 0 }, ptr %[[FIFTY]], 0 +! 
CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[FIFTY_BOX_VAL]], ptr %[[BOXED_FIFTY]], align 8 +! CHECK: %[[TABLE_BOX_VAL2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INTERMEDIATE]], align 8 +! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL2]], ptr %[[TABLE_BOX_ADDR2]], align 8 ! CHECK: call void @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9) +! CHECK: %[[LOAD_29:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[PRIV_BOX_ALLOC]], align 8 +! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[LOAD_29]], ptr %[[TABLE_BOX_ADDR]], align 8 +! CHECK: %[[PRIV_TABLE:.*]] = call ptr @malloc(i64 ptrtoint (ptr getelementptr ([10 x i32], ptr null, i32 1) to i64)) ! ... ! check that we use the private copy of table for table/=50 ! CHECK: omp.par.region3: diff --git a/flang/test/Lower/Intrinsics/acosh.f90 b/flang/test/Lower/Intrinsics/acosh.f90 index bc0d03f9d1b7f..e7041530b47d3 100644 --- a/flang/test/Lower/Intrinsics/acosh.f90 +++ b/flang/test/Lower/Intrinsics/acosh.f90 @@ -1,9 +1,9 @@ -! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL %s -! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL %s -! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL %s +! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL,FAST %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL,FAST %s +! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL,RELAXED %s +! 
RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL,RELAXED %s +! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL,PRECISE %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL,PRECISE %s function test_real4(x) real :: x, test_real4 @@ -11,7 +11,9 @@ function test_real4(x) end function ! ALL-LABEL: @_QPtest_real4 -! ALL: {{%[A-Za-z0-9._]+}} = fir.call @acoshf({{%[A-Za-z0-9._]+}}) {{.*}}: (f32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.acosh {{%[A-Za-z0-9._]+}} {{.*}}: f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.acosh {{%[A-Za-z0-9._]+}} {{.*}}: f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @acoshf({{%[A-Za-z0-9._]+}}) {{.*}}: (f32) -> f32 function test_real8(x) real(8) :: x, test_real8 @@ -19,7 +21,9 @@ function test_real8(x) end function ! ALL-LABEL: @_QPtest_real8 -! ALL: {{%[A-Za-z0-9._]+}} = fir.call @acosh({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.acosh {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.acosh {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @acosh({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 function test_complex4(x) complex :: x, test_complex4 diff --git a/flang/test/Lower/Intrinsics/asin.f90 b/flang/test/Lower/Intrinsics/asin.f90 index a2ff6d6dab757..73ca8c5a76ee5 100644 --- a/flang/test/Lower/Intrinsics/asin.f90 +++ b/flang/test/Lower/Intrinsics/asin.f90 @@ -1,9 +1,9 @@ -! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL %s -! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL %s -! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL %s -! 
RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL %s +! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL,FAST %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL,FAST %s +! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL,RELAXED %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL,RELAXED %s +! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL,PRECISE %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL,PRECISE %s function test_real4(x) real :: x, test_real4 @@ -11,7 +11,9 @@ function test_real4(x) end function ! ALL-LABEL: @_QPtest_real4 -! ALL: {{%[A-Za-z0-9._]+}} = fir.call @asinf({{%[A-Za-z0-9._]+}}) {{.*}}: (f32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.asin {{%[A-Za-z0-9._]+}} {{.*}}: f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.asin {{%[A-Za-z0-9._]+}} {{.*}}: f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @asinf({{%[A-Za-z0-9._]+}}) {{.*}}: (f32) -> f32 function test_real8(x) real(8) :: x, test_real8 @@ -19,7 +21,9 @@ function test_real8(x) end function ! ALL-LABEL: @_QPtest_real8 -! ALL: {{%[A-Za-z0-9._]+}} = fir.call @asin({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.asin {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.asin {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @asin({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 function test_complex4(x) complex :: x, test_complex4 diff --git a/flang/test/Lower/Intrinsics/asinh.f90 b/flang/test/Lower/Intrinsics/asinh.f90 index 193a8ac7a3d9e..aab666ac3ea3f 100644 --- a/flang/test/Lower/Intrinsics/asinh.f90 +++ b/flang/test/Lower/Intrinsics/asinh.f90 @@ -1,9 +1,9 @@ -! 
RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL %s -! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL %s -! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL %s +! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL,FAST %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL,FAST %s +! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL,RELAXED %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL,RELAXED %s +! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL,PRECISE %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL,PRECISE %s function test_real4(x) real :: x, test_real4 @@ -11,7 +11,9 @@ function test_real4(x) end function ! ALL-LABEL: @_QPtest_real4 -! ALL: {{%[A-Za-z0-9._]+}} = fir.call @asinhf({{%[A-Za-z0-9._]+}}) {{.*}}: (f32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.asinh {{%[A-Za-z0-9._]+}} {{.*}}: f32 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.asinh {{%[A-Za-z0-9._]+}} {{.*}}: f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @asinhf({{%[A-Za-z0-9._]+}}) {{.*}}: (f32) -> f32 function test_real8(x) real(8) :: x, test_real8 @@ -19,7 +21,10 @@ function test_real8(x) end function ! ALL-LABEL: @_QPtest_real8 -! ALL: {{%[A-Za-z0-9._]+}} = fir.call @asinh({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.asinh {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! 
RELAXED: {{%[A-Za-z0-9._]+}} = math.asinh {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @asinh({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 + function test_complex4(x) complex :: x, test_complex4 diff --git a/flang/test/Lower/Intrinsics/atanh.f90 b/flang/test/Lower/Intrinsics/atanh.f90 index a23b64f3414c4..255d95ff29c01 100644 --- a/flang/test/Lower/Intrinsics/atanh.f90 +++ b/flang/test/Lower/Intrinsics/atanh.f90 @@ -1,9 +1,9 @@ -! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL %s -! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL %s -! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL %s -! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL %s +! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL,FAST %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL,FAST %s +! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL,RELAXED %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL,RELAXED %s +! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL,PRECISE %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL,PRECISE %s function test_real4(x) real :: x, test_real4 @@ -11,7 +11,9 @@ function test_real4(x) end function ! ALL-LABEL: @_QPtest_real4 -! ALL: {{%[A-Za-z0-9._]+}} = fir.call @atanhf({{%[A-Za-z0-9._]+}}) {{.*}}: (f32) -> f32 +! FAST: {{%[A-Za-z0-9._]+}} = math.atanh {{%[A-Za-z0-9._]+}} {{.*}}: f32 +! 
RELAXED: {{%[A-Za-z0-9._]+}} = math.atanh {{%[A-Za-z0-9._]+}} {{.*}}: f32 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @atanhf({{%[A-Za-z0-9._]+}}) {{.*}}: (f32) -> f32 function test_real8(x) real(8) :: x, test_real8 @@ -19,7 +21,9 @@ function test_real8(x) end function ! ALL-LABEL: @_QPtest_real8 -! ALL: {{%[A-Za-z0-9._]+}} = fir.call @atanh({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 +! FAST: {{%[A-Za-z0-9._]+}} = math.atanh {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.atanh {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @atanh({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 function test_complex4(x) complex :: x, test_complex4 diff --git a/flang/test/Lower/OpenMP/declare-target-func-and-subr.f90 b/flang/test/Lower/OpenMP/declare-target-func-and-subr.f90 index db8320a598052..1c43f1d09eddb 100644 --- a/flang/test/Lower/OpenMP/declare-target-func-and-subr.f90 +++ b/flang/test/Lower/OpenMP/declare-target-func-and-subr.f90 @@ -85,6 +85,13 @@ FUNCTION FUNC_DEFAULT_EXTENDEDLIST() RESULT(I) I = 1 END FUNCTION FUNC_DEFAULT_EXTENDEDLIST +! ALL-LABEL: func.func @_QPfunc_name_as_result() +! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget{{.*}} +FUNCTION FUNC_NAME_AS_RESULT() +!$omp declare target(FUNC_NAME_AS_RESULT) + FUNC_NAME_AS_RESULT = 1.0 +END FUNCTION FUNC_NAME_AS_RESULT + !! ----- ! Check specification valid forms of declare target with subroutines diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 index b3a668018df1d..9c97c689dad70 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 @@ -57,4 +57,5 @@ end program compilation_to_obj ! LLVM: @[[GLOB_VAR:[^[:space:]]+]]t = internal global ! LLVM: define internal void @_QQmain..omp_par -! 
LLVM: call void @llvm.memcpy.p0.p0.i32(ptr %{{.+}}, ptr @[[GLOB_VAR]]t, i32 48, i1 false) +! LLVM: %[[GLOB_VAL:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr @[[GLOB_VAR]]t, align 8 +! LLVM-NEXT: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[GLOB_VAL]], ptr %{{.*}}, align 8 diff --git a/flang/test/Lower/OpenMP/map-character.f90 b/flang/test/Lower/OpenMP/map-character.f90 new file mode 100644 index 0000000000000..232c0a6361cb6 --- /dev/null +++ b/flang/test/Lower/OpenMP/map-character.f90 @@ -0,0 +1,47 @@ +! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + +subroutine TestOfCharacter(a0, a1, l) + character(len=*), intent(in) :: a0 + character(len=*), intent(inout):: a1 + integer, intent(in) :: l + + !$omp target map(to:a0) map(from: a1) + a1 = a0 + !$omp end target +end subroutine TestOfCharacter + + +!CHECK: func.func @_QPtestofcharacter(%[[ARG0:.*]]: !fir.boxchar<1> {{.*}}, %[[ARG1:.*]]: !fir.boxchar<1> {{.*}} +!CHECK: %[[A0_BOXCHAR_ALLOCA:.*]] = fir.alloca !fir.boxchar<1> +!CHECK: %[[A1_BOXCHAR_ALLOCA:.*]] = fir.alloca !fir.boxchar<1> +!CHECK: %[[UNBOXED_ARG0:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +!CHECK: %[[A0_DECL:.*]]:2 = hlfir.declare %[[UNBOXED_ARG0]]#0 typeparams %[[UNBOXED_ARG0]]#1 dummy_scope {{.*}} -> (!fir.boxchar<1>, !fir.ref>) +!CHECK: %[[UNBOXED_ARG1:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +!CHECK: %[[A1_DECL:.*]]:2 = hlfir.declare %[[UNBOXED_ARG1]]#0 typeparams %[[UNBOXED_ARG1]]#1 dummy_scope {{.*}} -> (!fir.boxchar<1>, !fir.ref>) +!CHECK: %[[UNBOXED_A0_DECL:.*]]:2 = fir.unboxchar %[[A0_DECL]]#0 : (!fir.boxchar<1>) -> (!fir.ref>, index) +!CHECK: %[[A0_LB:.*]] = arith.constant 0 : index +!CHECK: %[[A0_STRIDE:.*]] = arith.constant 1 : index +!CHECK: %[[A0_UB:.*]] = arith.subi %[[UNBOXED_A0_DECL]]#1, %[[A0_STRIDE]] : index +!CHECK: %[[A0_BOUNDS:.*]] = omp.map.bounds lower_bound(%[[A0_LB]] : index) upper_bound(%[[A0_UB]] : index) 
extent(%[[UNBOXED_A0_DECL]]#1 : index) +!CHECK-SAME: stride(%[[A0_STRIDE]] : index) start_idx(%[[A0_LB]] : index) {stride_in_bytes = true} +!CHECK: %[[A0_MAP:.*]] = omp.map.info var_ptr(%[[A0_DECL]]#1 : !fir.ref>, !fir.char<1,?>) map_clauses(to) capture(ByRef) bounds(%[[A0_BOUNDS]]) -> !fir.ref> {name = "a0"} +!CHECK: %[[UNBOXED_A1_DECL:.*]]:2 = fir.unboxchar %[[A1_DECL]]#0 : (!fir.boxchar<1>) -> (!fir.ref>, index) +!CHECK: %[[A1_LB:.*]] = arith.constant 0 : index +!CHECK: %[[A1_STRIDE:.*]] = arith.constant 1 : index +!CHECK: %[[A1_UB:.*]] = arith.subi %[[UNBOXED_A1_DECL]]#1, %[[A1_STRIDE]] : index +!CHECK: %[[A1_BOUNDS:.*]] = omp.map.bounds lower_bound(%[[A1_LB]] : index) upper_bound(%[[A1_UB]] : index) extent(%[[UNBOXED_A1_DECL]]#1 : index) +!CHECKL-SAME: stride(%[[A1_STRIDE]] : index) start_idx(%[[A1_LB]] : index) {stride_in_bytes = true} +!CHECK: %[[A1_MAP:.*]] = omp.map.info var_ptr(%[[A1_DECL]]#1 : !fir.ref>, !fir.char<1,?>) map_clauses(from) capture(ByRef) bounds(%[[A1_BOUNDS]]) -> !fir.ref> {name = "a1"} +!CHECK: fir.store %[[ARG1]] to %[[A1_BOXCHAR_ALLOCA]] : !fir.ref> +!CHECK: %[[A1_BOXCHAR_MAP:.*]] = omp.map.info var_ptr(%[[A1_BOXCHAR_ALLOCA]] : !fir.ref>, !fir.boxchar<1>) map_clauses(implicit, to) capture(ByRef) -> !fir.ref> {name = ""} +!CHECK: fir.store %[[ARG0]] to %[[A0_BOXCHAR_ALLOCA]] : !fir.ref> +!CHECK: %[[A0_BOXCHAR_MAP:.*]] = omp.map.info var_ptr(%[[A0_BOXCHAR_ALLOCA]] : !fir.ref>, !fir.boxchar<1>) map_clauses(implicit, to) capture(ByRef) -> !fir.ref> {name = ""} + +!CHECK: omp.target map_entries(%[[A0_MAP]] -> %[[TGT_A0:.*]], %[[A1_MAP]] -> %[[TGT_A1:.*]], %[[A1_BOXCHAR_MAP]] -> %[[TGT_A1_BOXCHAR:.*]], %[[A0_BOXCHAR_MAP]] -> %[[TGT_A0_BOXCHAR:.*]] : !fir.ref>, !fir.ref>, !fir.ref>, !fir.ref>) { +!CHECK: %[[TGT_A0_BC_LD:.*]] = fir.load %[[TGT_A0_BOXCHAR]] : !fir.ref> +!CHECK: %[[TGT_A1_BC_LD:.*]] = fir.load %[[TGT_A1_BOXCHAR]] : !fir.ref> +!CHECK: %[[UNBOXED_TGT_A1:.*]]:2 = fir.unboxchar %[[TGT_A1_BC_LD]] : (!fir.boxchar<1>) -> (!fir.ref>, 
index) +!CHECK: %[[UNBOXED_TGT_A0:.*]]:2 = fir.unboxchar %[[TGT_A0_BC_LD]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +!CHECK: %[[TGT_A0_DECL:.*]]:2 = hlfir.declare %[[TGT_A0]] typeparams %[[UNBOXED_TGT_A0]]#1 {{.*}} -> (!fir.boxchar<1>, !fir.ref>) +!CHECK: %[[TGT_A1_DECL:.*]]:2 = hlfir.declare %[[TGT_A1]] typeparams %[[UNBOXED_TGT_A1]]#1 {{.*}} -> (!fir.boxchar<1>, !fir.ref>) + diff --git a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 index b3e25ae779561..254544265dd6d 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 @@ -17,7 +17,7 @@ subroutine proc end subroutine proc !CHECK-LABEL: define void @proc_() -!CHECK: call void (ptr, i32, ptr, ...) +!CHECK: call void !CHECK-SAME: @__kmpc_fork_call(ptr {{.*}}, i32 1, ptr @[[OMP_PAR:.*]], {{.*}}) !CHECK: define internal void @[[OMP_PAR]](ptr {{.*}} %[[TID_ADDR:.*]], ptr noalias diff --git a/flang/test/Lower/OpenMP/target_cpu_features.f90 b/flang/test/Lower/OpenMP/target_cpu_features.f90 index ea8efcf5d256b..4532593156eab 100644 --- a/flang/test/Lower/OpenMP/target_cpu_features.f90 +++ b/flang/test/Lower/OpenMP/target_cpu_features.f90 @@ -12,7 +12,7 @@ !AMDGCN-SAME: "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot3-insts", !AMDGCN-SAME: "+dot4-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", !AMDGCN-SAME: "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+mai-insts", -!AMDGCN-SAME: "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize64"]> +!AMDGCN-SAME: "+s-memrealtime", "+s-memtime-inst", "+vmem-to-lds-load-insts", "+wavefrontsize64"]> !NVPTX: module attributes { !NVPTX-SAME: fir.target_cpu = "sm_80" diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90 index db518c541918a..c32af3c2f54b5 100644 --- a/flang/test/Lower/allocatable-polymorphic.f90 +++ b/flang/test/Lower/allocatable-polymorphic.f90 @@ 
-603,9 +603,10 @@ program test_alloc ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 1, i32 0) ! LLVM: call void @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 20) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) -! LLVM-COUNT-2: call void %{{[0-9]*}}() +! LLVM-COUNT-2: call void %{{.*}}() -! LLVM: call void @llvm.memcpy.p0.p0.i32 +! LLVM: %[[C1_LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}} +! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[C1_LOAD]], ptr %{{.*}} ! LLVM: %[[GEP_TDESC_C1:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 7 ! LLVM: %[[TDESC_C1:.*]] = load ptr, ptr %[[GEP_TDESC_C1]] ! LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1 @@ -619,7 +620,8 @@ program test_alloc ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %{{.*}}, ptr %[[TMP:.*]] ! LLVM: call void %{{.*}}(ptr %{{.*}}) -! LLVM: call void @llvm.memcpy.p0.p0.i32 +! LLVM: %[[LOAD_C2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}} +! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD_C2]], ptr %{{.*}} ! LLVM: %[[GEP_TDESC_C2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 7 ! LLVM: %[[TDESC_C2:.*]] = load ptr, ptr %[[GEP_TDESC_C2]] ! LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1 @@ -633,7 +635,9 @@ program test_alloc ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %{{.*}}, ptr %{{.*}} ! LLVM: call void %{{.*}}(ptr %{{.*}}) -! LLVM: call void @llvm.memcpy.p0.p0.i32 +! LLVM: %[[C3_LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}} +! 
LLVM: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[C3_LOAD]], ptr %{{.*}} + ! LLVM: %[[GEP_TDESC_C3:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 8 ! LLVM: %[[TDESC_C3:.*]] = load ptr, ptr %[[GEP_TDESC_C3]] ! LLVM: %[[ELE_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1 @@ -654,7 +658,8 @@ program test_alloc ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX7]], ptr %{{.*}} ! LLVM: call void %{{.*}}(ptr %{{.*}}) -! LLVM: call void @llvm.memcpy.p0.p0.i32 +! LLVM: %[[C4_LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}} +! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[C4_LOAD]], ptr %{{.*}} ! LLVM: %[[GEP_TDESC_C4:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 8 ! LLVM: %[[TDESC_C4:.*]] = load ptr, ptr %[[GEP_TDESC_C4]] ! LLVM: %[[ELE_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1 @@ -681,7 +686,8 @@ program test_alloc ! LLVM-LABEL: define void @_QMpolyPtest_deallocate() ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 ptrtoint (ptr getelementptr (%_QMpolyTp1, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyEXdtXp1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]] -! LLVM: call void @llvm.memcpy.p0.p0.i32(ptr %[[ALLOCA2:[0-9]+]], ptr %[[ALLOCA1]], i32 40, i1 false) +! LLVM: %[[LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[ALLOCA1]] +! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD]], ptr %[[ALLOCA2:[0-9]*]] ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %[[ALLOCA2]], ptr @_QMpolyEXdtXp1, i32 0, i32 0) ! 
LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableDeallocatePolymorphic(ptr %[[ALLOCA2]], ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) diff --git a/flang/test/Semantics/OpenMP/declare-target-function-name-with-symbols.f90 b/flang/test/Semantics/OpenMP/declare-target-function-name-with-symbols.f90 new file mode 100644 index 0000000000000..9a0acdb3dd100 --- /dev/null +++ b/flang/test/Semantics/OpenMP/declare-target-function-name-with-symbols.f90 @@ -0,0 +1,34 @@ +!RUN: %flang_fc1 -fdebug-unparse-with-symbols -fopenmp %s 2>&1 | FileCheck %s + +! This used to crash. + +module test + contains + function ex(a, b, c) + !$omp declare target(ex) + integer :: a, b, c + ex = a + b + c + end function ex +end module test + +!CHECK: !DEF: /test Module +!CHECK: module test +!CHECK: contains +!CHECK: !DEF: /test/ex PUBLIC (Function, OmpDeclareTarget) Subprogram REAL(4) +!CHECK: !DEF: /test/ex/a ObjectEntity INTEGER(4) +!CHECK: !DEF: /test/ex/b ObjectEntity INTEGER(4) +!CHECK: !DEF: /test/ex/c ObjectEntity INTEGER(4) +!CHECK: function ex(a, b, c) +!CHECK: !$omp declare target (ex) +!CHECK: !REF: /test/ex/a +!CHECK: !REF: /test/ex/b +!CHECK: !REF: /test/ex/c +!CHECK: integer a, b, c +!CHECK: !DEF: /test/ex/ex (Implicit, OmpDeclareTarget) ObjectEntity REAL(4) +!CHECK: !REF: /test/ex/a +!CHECK: !REF: /test/ex/b +!CHECK: !REF: /test/ex/c +!CHECK: ex = a+b+c +!CHECK: end function ex +!CHECK: end module test + diff --git a/flang/test/Semantics/OpenMP/nested-target.f90 b/flang/test/Semantics/OpenMP/nested-target.f90 index f42b5dde6a08d..6a56a84f4f570 100644 --- a/flang/test/Semantics/OpenMP/nested-target.f90 +++ b/flang/test/Semantics/OpenMP/nested-target.f90 @@ -54,6 +54,7 @@ program main n2 = 10 !$omp target teams map(to:a) !PORTABILITY: If TARGET DATA directive is nested inside TARGET region, the behaviour is unspecified + !ERROR: Only 
`DISTRIBUTE`, `PARALLEL`, or `LOOP` regions are allowed to be strictly nested inside `TEAMS` region. !$omp target data map(n1,n2) do i=1, n1 do j=1, n2 @@ -65,6 +66,7 @@ program main !$omp target teams map(to:a) map(from:n1,n2) !PORTABILITY: If TARGET TEAMS DISTRIBUTE PARALLEL DO directive is nested inside TARGET region, the behaviour is unspecified + !ERROR: Only `DISTRIBUTE`, `PARALLEL`, or `LOOP` regions are allowed to be strictly nested inside `TEAMS` region. !$omp target teams distribute parallel do do i=1, n1 do j=1, n2 diff --git a/flang/test/Semantics/OpenMP/nested-teams.f90 b/flang/test/Semantics/OpenMP/nested-teams.f90 index b1a7c92a6906b..974172ee97175 100644 --- a/flang/test/Semantics/OpenMP/nested-teams.f90 +++ b/flang/test/Semantics/OpenMP/nested-teams.f90 @@ -68,6 +68,7 @@ program main !$omp end target !$omp target teams + !ERROR: Only `DISTRIBUTE`, `PARALLEL`, or `LOOP` regions are allowed to be strictly nested inside `TEAMS` region. !ERROR: TEAMS region can only be strictly nested within the implicit parallel region or TARGET region !$omp teams a = 3.14 diff --git a/flang/tools/flang-driver/driver.cpp b/flang/tools/flang-driver/driver.cpp index 52136df10c0b0..74f35d8b795aa 100644 --- a/flang/tools/flang-driver/driver.cpp +++ b/flang/tools/flang-driver/driver.cpp @@ -110,6 +110,13 @@ int main(int argc, const char **argv) { } } + llvm::StringSet<> savedStrings; + // Handle FCC_OVERRIDE_OPTIONS, used for editing a command line behind the + // scenes. + if (const char *overrideStr = ::getenv("FCC_OVERRIDE_OPTIONS")) + clang::driver::applyOverrideOptions(args, overrideStr, savedStrings, + "FCC_OVERRIDE_OPTIONS", &llvm::errs()); + // Not in the frontend mode - continue in the compiler driver mode. 
// Create DiagnosticsEngine for the compiler driver diff --git a/lldb/source/Core/DataFileCache.cpp b/lldb/source/Core/DataFileCache.cpp index ef0e07a8b0342..9109269711231 100644 --- a/lldb/source/Core/DataFileCache.cpp +++ b/lldb/source/Core/DataFileCache.cpp @@ -132,6 +132,11 @@ bool DataFileCache::SetCachedData(llvm::StringRef key, if (file_or_err) { llvm::CachedFileStream *cfs = file_or_err->get(); cfs->OS->write((const char *)data.data(), data.size()); + if (llvm::Error err = cfs->commit()) { + Log *log = GetLog(LLDBLog::Modules); + LLDB_LOG_ERROR(log, std::move(err), + "failed to commit to the cache for key: {0}"); + } return true; } else { Log *log = GetLog(LLDBLog::Modules); diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 75876c514c366..7db45f13e03b0 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1710,6 +1710,20 @@ The AMDGPU backend supports the following LLVM IR attributes. as hidden. Hidden arguments are managed by the compiler and are not part of the explicit arguments supplied by the user. + "amdgpu-sgpr-hazard-wait" Disabled SGPR hazard wait insertion if set to 0. + Exists for testing performance impact of SGPR hazard waits only. + + "amdgpu-sgpr-hazard-boundary-cull" Enable insertion of SGPR hazard cull sequences at function call boundaries. + Cull sequence reduces future hazard waits, but has a performance cost. + + "amdgpu-sgpr-hazard-mem-wait-cull" Enable insertion of SGPR hazard cull sequences before memory waits. + Cull sequence reduces future hazard waits, but has a performance cost. + Attempt to amortize cost by overlapping with memory accesses. + + "amdgpu-sgpr-hazard-mem-wait-cull-threshold" + Sets the number of active SGPR hazards that must be present before + inserting a cull sequence at a memory wait. 
+ ======================================= ========================================================== Calling Conventions @@ -18156,7 +18170,7 @@ terminated by an ``.end_amdhsa_kernel`` directive. (cumode) ``.amdhsa_memory_ordered`` 1 GFX10-GFX12 Controls MEM_ORDERED in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. - ``.amdhsa_forward_progress`` 0 GFX10-GFX12 Controls FWD_PROGRESS in + ``.amdhsa_forward_progress`` 1 GFX10-GFX12 Controls FWD_PROGRESS in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_shared_vgpr_count`` 0 GFX10-GFX11 Controls SHARED_VGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table`. diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst index 8dc1357635e1b..115e3a4c4ce38 100644 --- a/llvm/docs/CommandGuide/llvm-objcopy.rst +++ b/llvm/docs/CommandGuide/llvm-objcopy.rst @@ -73,6 +73,12 @@ multiple file formats. For MachO objects, ``
`` must be formatted as ``,
``. +.. option:: --dump-offload-bundle= + + Dump the HIP Offload Bundle entry specified by the URI syntax given, into a + code object file. + + .. option:: --enable-deterministic-archives, -D Enable deterministic mode when copying archives, i.e. use 0 for archive member diff --git a/llvm/docs/CommandGuide/llvm-readobj.rst b/llvm/docs/CommandGuide/llvm-readobj.rst index 8bd29eafbbfcf..faaddb4699f7d 100644 --- a/llvm/docs/CommandGuide/llvm-readobj.rst +++ b/llvm/docs/CommandGuide/llvm-readobj.rst @@ -104,6 +104,10 @@ file formats. Do not demangle symbol names in the output. This option is only for ELF and XCOFF file formats. The option is enabled by default. +.. option:: --offloading + + Display list of HIP Offload bundles using URI syntax. + .. option:: --relocations, --relocs, -r Display the relocation entries in the file. diff --git a/llvm/include/llvm/Analysis/ScopedNoAliasAA.h b/llvm/include/llvm/Analysis/ScopedNoAliasAA.h index f6ade7c83a61a..96afe3ce6ecdf 100644 --- a/llvm/include/llvm/Analysis/ScopedNoAliasAA.h +++ b/llvm/include/llvm/Analysis/ScopedNoAliasAA.h @@ -43,6 +43,9 @@ class ScopedNoAliasAAResult : public AAResultBase { ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2, AAQueryInfo &AAQI); + void collectScopedDomains(const MDNode *NoAlias, + SmallPtrSetImpl &Domains) const; + private: bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias) const; }; diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 596db39239213..92ad26df0cb35 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -465,7 +465,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { bool useAA() const { return getST()->useAA(); } - bool isTypeLegal(Type *Ty) { + bool isTypeLegal(Type *Ty) const { EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true); return getTLI()->isTypeLegal(VT); } diff --git 
a/llvm/include/llvm/CodeGen/TargetSchedule.h b/llvm/include/llvm/CodeGen/TargetSchedule.h index bfe4234abf8eb..69108a15e6cfe 100644 --- a/llvm/include/llvm/CodeGen/TargetSchedule.h +++ b/llvm/include/llvm/CodeGen/TargetSchedule.h @@ -45,6 +45,16 @@ class TargetSchedModel { unsigned computeInstrLatency(const MCSchedClassDesc &SCDesc) const; + // EnableSchedModel and EnableSchedItins are used to control whether or not to + // use the Target's {SchedMachineModel, InstrItins} for hardware infor based + // Scheduling decisions. If both are enabled, as is the default, preference + // will be given to one based on the API implementation. By disabling one, we + // can force preference of the other. By disabling both, we will throw away + // any target specific hardware details for scheduling decisions, and fall + // into things that provide generic info such as defaultDefLatency. + bool EnableSchedModel = true; + bool EnableSchedItins = true; + public: TargetSchedModel() : SchedModel(MCSchedModel::Default) {} @@ -53,7 +63,8 @@ class TargetSchedModel { /// The machine model API keeps a copy of the top-level MCSchedModel table /// indices and may query TargetSubtargetInfo and TargetInstrInfo to resolve /// dynamic properties. - void init(const TargetSubtargetInfo *TSInfo); + void init(const TargetSubtargetInfo *TSInfo, bool EnableSModel = true, + bool EnableSItins = true); /// Return the MCSchedClassDesc for this instruction. const MCSchedClassDesc *resolveSchedClass(const MachineInstr *MI) const; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 1ab5db5da22ea..8e4b119414dde 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -40,9 +40,10 @@ class OpenMPIRBuilder; /// not have any PHINodes. If \p CreateBranch is true, a branch instruction to /// \p New will be added such that there is no semantic change. 
Otherwise, the /// \p IP insert block remains degenerate and it is up to the caller to insert a -/// terminator. -void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, - bool CreateBranch); +/// terminator. \p DL is used as the debug location for the branch instruction +/// if one is created. +void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, + DebugLoc DL); /// Splice a BasicBlock at an IRBuilder's current insertion point. Its new /// insert location will stick to after the instruction before the insertion @@ -58,9 +59,10 @@ void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch); /// is true, a branch to the new successor will new created such that /// semantically there is no change; otherwise the block of the insertion point /// remains degenerate and it is the caller's responsibility to insert a -/// terminator. Returns the new successor block. +/// terminator. \p DL is used as the debug location for the branch instruction +/// if one is created. Returns the new successor block. BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, - llvm::Twine Name = {}); + DebugLoc DL, llvm::Twine Name = {}); /// Split a BasicBlock at \p Builder's insertion point, even if the block is /// degenerate (missing the terminator). 
Its new insert location will stick to diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index 0bfc7995e5bd8..141c11d3a763e 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -3077,14 +3077,14 @@ template class DIExprConstVisitor { } std::optional getType(DIOp::BitOffset Op, ArrayRef Ins) { - if (!Ins[0].ResultType->isIntegerTy()) + if (!Ins[1].ResultType->isIntegerTy()) return getTypeError( "DIOpBitOffset requires first input be integer typed"); return Op.getResultType(); } std::optional getType(DIOp::ByteOffset Op, ArrayRef Ins) { - if (!Ins[0].ResultType->isIntegerTy()) + if (!Ins[1].ResultType->isIntegerTy()) return getTypeError( "DIOpByteOffset requires first input be integer typed"); return Op.getResultType(); @@ -3107,13 +3107,13 @@ template class DIExprConstVisitor { } std::optional getType(DIOp::Select Op, ArrayRef Ins) { - if (Ins[0].ResultType != Ins[1].ResultType) + if (Ins[1].ResultType != Ins[2].ResultType) return getTypeError( "DIOpSelect requires first two inputs have same type"); - if (!Ins[0].ResultType->isVectorTy()) + if (!Ins[1].ResultType->isVectorTy()) return getTypeError( "DIOpSelect requires first two inputs to be vector typed"); - return Ins[0].ResultType; + return Ins[1].ResultType; } std::optional getType(DIOp::AddrOf Op, ArrayRef) { diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index eab042da5e118..ac0eb4f7a3358 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -549,6 +549,7 @@ def llvm_v3i32_ty : LLVMType; // 3 x i32 def llvm_v4i32_ty : LLVMType; // 4 x i32 def llvm_v6i32_ty : LLVMType; // 6 x i32 def llvm_v8i32_ty : LLVMType; // 8 x i32 +def llvm_v10i32_ty : LLVMType; // 10 x i32 def llvm_v16i32_ty : LLVMType; // 16 x i32 def llvm_v32i32_ty : LLVMType; // 32 x i32 def llvm_v64i32_ty : LLVMType; // 64 x i32 @@ -578,6 +579,7 @@ def llvm_v2f32_ty : LLVMType; // 
2 x float def llvm_v3f32_ty : LLVMType; // 3 x float def llvm_v4f32_ty : LLVMType; // 4 x float def llvm_v8f32_ty : LLVMType; // 8 x float +def llvm_v10f32_ty : LLVMType; // 10 x float def llvm_v16f32_ty : LLVMType; // 16 x float def llvm_v32f32_ty : LLVMType; // 32 x float def llvm_v1f64_ty : LLVMType; // 1 x double diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index cc3584833202b..93c8b7142d174 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1867,7 +1867,9 @@ class AMDGPURawBufferLoadLDS : Intrinsic < ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS; -class AMDGPURawPtrBufferLoadLDS : Intrinsic < +class AMDGPURawPtrBufferLoadLDS : + ClangBuiltin<"__builtin_amdgcn_raw_ptr_buffer_load_lds">, + Intrinsic < [], [AMDGPUBufferRsrcTy, // rsrc(SGPR) LLVMQualPointerType<3>, // LDS base offset @@ -2718,18 +2720,21 @@ def int_amdgcn_ds_sub_gs_reg_rtn : [ImmArg>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; -def int_amdgcn_ds_bvh_stack_rtn : +class IntDSBVHStackRtn : Intrinsic< - [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr + [vdst, llvm_i32_ty], // %vdst, %addr [ llvm_i32_ty, // %addr llvm_i32_ty, // %data0 - llvm_v4i32_ty, // %data1 + data1, // %data1 llvm_i32_ty, // %offset ], [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] >; +def int_amdgcn_ds_bvh_stack_rtn : IntDSBVHStackRtn; + def int_amdgcn_s_wait_event_export_ready : ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] @@ -2805,6 +2810,37 @@ def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; + +def int_amdgcn_ds_bvh_stack_push8_pop1_rtn : IntDSBVHStackRtn; + +def int_amdgcn_ds_bvh_stack_push8_pop2_rtn : IntDSBVHStackRtn; + +// , , +// llvm.amdgcn.image.bvh.dual.intersect.ray , , 
+// , , +// , , +// +def int_amdgcn_image_bvh_dual_intersect_ray : + Intrinsic<[llvm_v10i32_ty, llvm_v3f32_ty, llvm_v3f32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_i8_ty, llvm_v3f32_ty, + llvm_v3f32_ty, llvm_v2i32_ty, llvm_v4i32_ty], + [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + +// , , +// llvm.amdgcn.image.bvh8.intersect.ray , , +// , , +// , , +// +def int_amdgcn_image_bvh8_intersect_ray : + Intrinsic<[llvm_v10i32_ty, llvm_v3f32_ty, llvm_v3f32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_i8_ty, llvm_v3f32_ty, + llvm_v3f32_ty, llvm_i32_ty, llvm_v4i32_ty], + [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + // llvm.amdgcn.permlane16.var def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">, Intrinsic<[llvm_i32_ty], @@ -2892,7 +2928,7 @@ class AMDGPULoadIntrinsic: Intrinsic< [llvm_any_ty], [ptr_ty], - [IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture>, IntrNoCallback, IntrNoFree], + [IntrReadMem, IntrArgMemOnly, IntrWillReturn, IntrConvergent, NoCapture>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand] >; @@ -3339,6 +3375,12 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">, [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; +// llvm.amdgcn.cvt.off.fp32.i4 int srcA +def int_amdgcn_cvt_off_f32_i4: ClangBuiltin<"__builtin_amdgcn_cvt_off_f32_i4">, + DefaultAttrsIntrinsic<[llvm_float_ty], + [llvm_i32_ty], + [IntrNoMem, IntrSpeculatable]>; + //===----------------------------------------------------------------------===// // gfx950 intrinsics //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/ObjCopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h index aea9cd6f9a9c7..61acf26c1fd9c 100644 --- a/llvm/include/llvm/ObjCopy/CommonConfig.h +++ b/llvm/include/llvm/ObjCopy/CommonConfig.h @@ -276,6 +276,8 @@ struct CommonConfig { bool StripUnneeded = false; bool Weaken = false; bool 
DecompressDebugSections = false; + bool DumpOffloadBundle = false; + bool NeedPositional = true; DebugCompressionType CompressionType = DebugCompressionType::None; diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h index 4e880fe9dbe82..9a6bf95b6fd84 100644 --- a/llvm/include/llvm/Object/OffloadBinary.h +++ b/llvm/include/llvm/Object/OffloadBinary.h @@ -51,31 +51,6 @@ enum ImageKind : uint16_t { IMG_LAST, }; -class CompressedOffloadBundle { -private: - static inline const size_t MagicSize = 4; - static inline const size_t VersionFieldSize = sizeof(uint16_t); - static inline const size_t MethodFieldSize = sizeof(uint16_t); - static inline const size_t FileSizeFieldSize = sizeof(uint32_t); - static inline const size_t UncompressedSizeFieldSize = sizeof(uint32_t); - static inline const size_t HashFieldSize = sizeof(uint64_t); - static inline const size_t V1HeaderSize = - MagicSize + VersionFieldSize + MethodFieldSize + - UncompressedSizeFieldSize + HashFieldSize; - static inline const size_t V2HeaderSize = - MagicSize + VersionFieldSize + FileSizeFieldSize + MethodFieldSize + - UncompressedSizeFieldSize + HashFieldSize; - static inline const llvm::StringRef MagicNumber = "CCOB"; - static inline const uint16_t Version = 2; - -public: - static llvm::Expected> - compress(llvm::compression::Params P, const llvm::MemoryBuffer &Input, - bool Verbose = false); - static llvm::Expected> - decompress(llvm::MemoryBufferRef &Input, bool Verbose = false); -}; - /// A simple binary serialization of an offloading file. We use this format to /// embed the offloading image into the host executable so it can be extracted /// and used by the linker. @@ -210,160 +185,11 @@ class OffloadFile : public OwningBinary { } }; -/// Bundle entry in binary clang-offload-bundler format. 
-struct OffloadBundleEntry { - uint64_t Offset = 0u; - uint64_t Size = 0u; - uint64_t IDLength = 0u; - StringRef ID; - OffloadBundleEntry(uint64_t O, uint64_t S, uint64_t I, StringRef T) - : Offset(O), Size(S), IDLength(I), ID(T) {} - void dumpInfo(raw_ostream &OS) { - OS << "Offset = " << Offset << ", Size = " << Size - << ", ID Length = " << IDLength << ", ID = " << ID; - } - void dumpURI(raw_ostream &OS, StringRef filePath) { - OS << ID.data() << "\tfile:\/\/" << filePath << "#offset=" << Offset - << "&size=" << Size << "\n"; - } -}; - -/// Fat binary embedded in object files in clang-offload-bundler format -class OffloadBundleFatBin { - -private: - uint64_t Size = 0u; - StringRef FileName; - uint64_t NumberOfEntries; - SmallVector Entries; - -public: - SmallVector getEntries() { return Entries; } - uint64_t getSize() const { return Size; } - StringRef getFileName() const { return FileName; } - uint64_t getNumEntries() const { return NumberOfEntries; } - - static Expected> - create(MemoryBufferRef, uint64_t SectionOffset, StringRef fileName); - Error extractBundle(const ObjectFile &Source); - - Error DumpEntryToCodeObject(); - - Error ReadEntries(StringRef Section, uint64_t SectionOffset); - void DumpEntries() { - SmallVectorImpl::iterator it = Entries.begin(); - for (uint64_t I = 0; I < Entries.size(); I++) { - it->dumpInfo(outs()); - ++it; - } - } - - void PrintEntriesAsURI() { - SmallVectorImpl::iterator it = Entries.begin(); - for (uint64_t I = 0; I < NumberOfEntries; I++) { - it->dumpURI(outs(), FileName); - ++it; - } - } - - OffloadBundleFatBin(MemoryBufferRef Source, StringRef file) : FileName(file) { - NumberOfEntries = 0; - Entries = SmallVector(); - } - - SmallVector EntryIDContains(StringRef str) { - SmallVector found = SmallVector(); - SmallVectorImpl::iterator it = Entries.begin(); - for (uint64_t I = 0; I < NumberOfEntries; I++) { - if (it->ID.contains(str)) { - found.push_back(*it); - } - - ++it; - } - return found; - } -}; - -enum uri_type_t { 
FILE_URI, MEMORY_URI }; - -struct OffloadBundleURI { - int64_t Offset = 0; - int64_t Size = 0; - uint64_t ProcessID = 0; - StringRef FileName; - uri_type_t URIType; - - // Constructors - // TODO: add a Copy ctor ? - OffloadBundleURI(StringRef file, int64_t off, int64_t size) - : Offset(off), Size(size), ProcessID(0), FileName(file), - URIType(FILE_URI) {} - - OffloadBundleURI(StringRef str, uri_type_t type) { - URIType = type; - switch (URIType) { - case FILE_URI: - parseFileName(str); - break; - case MEMORY_URI: - parseMemoryURI(str); - break; - default: - report_fatal_error("Unrecognized URI type."); - } - } - - void parseFileName(StringRef str) { - ProcessID = 0; - URIType = FILE_URI; - if (str.consume_front("file://")) { - StringRef FilePathname = - str.take_until([](char c) { return (c == '#') || (c == '?'); }); - FileName = FilePathname; - str = str.drop_front(FilePathname.size()); - - if (str.consume_front("#offset=")) { - StringRef OffsetStr = str.take_until([](char c) { return c == '&'; }); - OffsetStr.getAsInteger(10, Offset); - str = str.drop_front(OffsetStr.size()); - - if (str.consume_front("&size=")) { - Size; - str.getAsInteger(10, Size); - } else - report_fatal_error("Reading 'size' in URI."); - } else - report_fatal_error("Reading 'offset' in URI."); - } else - report_fatal_error("Reading type of URI."); - } - - void parseMemoryURI(StringRef str) { - // TODO: add parseMemoryURI type - } - - StringRef getFileName() const { return FileName; } -}; - /// Extracts embedded device offloading code from a memory \p Buffer to a list /// of \p Binaries. 
Error extractOffloadBinaries(MemoryBufferRef Buffer, SmallVectorImpl &Binaries); -/// Extracts fat binary in binary clang-offload-bundler format from object \p -/// Obj and return it in \p Bundles -Error extractOffloadBundleFatBinary( - const ObjectFile &Obj, SmallVectorImpl &Bundles); - -/// Extract code object memory from the given \p Source object file at \p Offset -/// and of \p Size, and copy into \p OutputFileName. -Error extractCodeObject(const ObjectFile &Source, int64_t Offset, int64_t Size, - StringRef OutputFileName); - -/// Extracts an Offload Bundle Entry given by URI -Error extractOffloadBundleByURI(StringRef URIstr); - /// Convert a string \p Name to an image kind. ImageKind getImageKind(StringRef Name); diff --git a/llvm/include/llvm/Object/OffloadBundle.h b/llvm/include/llvm/Object/OffloadBundle.h new file mode 100644 index 0000000000000..2208392b11307 --- /dev/null +++ b/llvm/include/llvm/Object/OffloadBundle.h @@ -0,0 +1,228 @@ +//===- OffloadBundle.h - Utilities for offload bundles---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-------------------------------------------------------------------------===// +// +// This file contains the binary format used for budingling device metadata with +// an associated device image. The data can then be stored inside a host object +// file to create a fat binary and read by the linker. This is intended to be a +// thin wrapper around the image itself. If this format becomes sufficiently +// complex it should be moved to a standard binary format like msgpack or ELF. 
+// +//===-------------------------------------------------------------------------===// + +#ifndef LLVM_OBJECT_OFFLOADBUNDLE_H +#define LLVM_OBJECT_OFFLOADBUNDLE_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Object/Binary.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Compression.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" +#include + +namespace llvm { + +namespace object { + +// CompressedOffloadBundle represents the format for the compressed offload +// bundles. +// +// The format is as follows: +// - Magic Number (4 bytes) - A constant "CCOB". +// - Version (2 bytes) +// - Compression Method (2 bytes) - Uses the values from +// llvm::compression::Format. +// - Total file size (4 bytes in V2, 8 bytes in V3). +// - Uncompressed Size (4 bytes in V1/V2, 8 bytes in V3). +// - Truncated MD5 Hash (8 bytes). +// - Compressed Data (variable length). +class CompressedOffloadBundle { +private: + static inline const llvm::StringRef MagicNumber = "CCOB"; + +public: + struct CompressedBundleHeader { + unsigned Version; + llvm::compression::Format CompressionFormat; + std::optional FileSize; + size_t UncompressedFileSize; + uint64_t Hash; + + static llvm::Expected tryParse(llvm::StringRef); + }; + + static inline const uint16_t DefaultVersion = 2; + + static llvm::Expected> + compress(llvm::compression::Params P, const llvm::MemoryBuffer &Input, + uint16_t Version, bool Verbose = false); + static llvm::Expected> + decompress(const llvm::MemoryBuffer &Input, bool Verbose = false); +}; + +/// Bundle entry in binary clang-offload-bundler format. 
+struct OffloadBundleEntry { + uint64_t Offset = 0u; + uint64_t Size = 0u; + uint64_t IDLength = 0u; + std::string ID; + OffloadBundleEntry(uint64_t O, uint64_t S, uint64_t I, std::string T) + : Offset(O), Size(S), IDLength(I) { + ID.reserve(T.size()); + ID = T; + } + void dumpInfo(raw_ostream &OS) { + OS << "Offset = " << Offset << ", Size = " << Size + << ", ID Length = " << IDLength << ", ID = " << ID << "\n"; + } + void dumpURI(raw_ostream &OS, StringRef FilePath) { + OS << ID.data() << "\tfile://" << FilePath << "#offset=" << Offset + << "&size=" << Size << "\n"; + } +}; + +/// Fat binary embedded in object files in clang-offload-bundler format +class OffloadBundleFatBin { + + uint64_t Size = 0u; + StringRef FileName; + uint64_t NumberOfEntries; + SmallVector Entries; + bool Decompressed; + +public: + std::unique_ptr DecompressedBuffer; + + SmallVector getEntries() { return Entries; } + uint64_t getSize() const { return Size; } + StringRef getFileName() const { return FileName; } + uint64_t getNumEntries() const { return NumberOfEntries; } + bool isDecompressed() const { return Decompressed; } + + LLVM_ABI static Expected> + create(MemoryBufferRef, uint64_t SectionOffset, StringRef FileName, + bool Decompress = false); + LLVM_ABI Error extractBundle(const ObjectFile &Source); + + Error dumpEntryToCodeObject(); + + Error readEntries(StringRef Section, uint64_t SectionOffset); + void dumpEntries() { + for (OffloadBundleEntry &Entry : Entries) + Entry.dumpInfo(outs()); + } + + void printEntriesAsURI() { + for (OffloadBundleEntry &Entry : Entries) + Entry.dumpURI(outs(), FileName); + } + + OffloadBundleFatBin(MemoryBufferRef Source, StringRef File, + bool Decompress = false) + : FileName(File), NumberOfEntries(0), + Entries(SmallVector()), Decompressed(Decompress) { + if (Decompress) { + DecompressedBuffer = + MemoryBuffer::getMemBufferCopy(Source.getBuffer(), File); + } + } +}; + +enum UriTypeT { FILE_URI, MEMORY_URI }; + +struct OffloadBundleURI { + int64_t 
Offset = 0; + int64_t Size = 0; + uint64_t ProcessID = 0; + StringRef FileName; + UriTypeT URIType; + + // Constructors + // TODO: add a Copy ctor ? + OffloadBundleURI(StringRef File, int64_t Off, int64_t Size) + : Offset(Off), Size(Size), ProcessID(0), FileName(File), + URIType(FILE_URI) {} + +public: + static Expected> + createOffloadBundleURI(StringRef Str, UriTypeT Type) { + switch (Type) { + case FILE_URI: + return createFileURI(Str); + break; + case MEMORY_URI: + return createMemoryURI(Str); + break; + } + llvm_unreachable("Unknown UriTypeT enum"); + } + + static Expected> + createFileURI(StringRef Str) { + int64_t O = 0; + int64_t S = 0; + + if (!Str.consume_front("file://")) + return createStringError(object_error::parse_failed, + "Reading type of URI"); + + StringRef FilePathname = + Str.take_until([](char C) { return (C == '#') || (C == '?'); }); + Str = Str.drop_front(FilePathname.size()); + + if (!Str.consume_front("#offset=")) + return createStringError(object_error::parse_failed, + "Reading 'offset' in URI"); + + StringRef OffsetStr = Str.take_until([](char C) { return C == '&'; }); + OffsetStr.getAsInteger(10, O); + Str = Str.drop_front(OffsetStr.size()); + + if (!Str.consume_front("&size=")) + return createStringError(object_error::parse_failed, + "Reading 'size' in URI"); + + Str.getAsInteger(10, S); + std::unique_ptr OffloadingURI( + new OffloadBundleURI(FilePathname, O, S)); + return std::move(OffloadingURI); + } + + static Expected> + createMemoryURI(StringRef Str) { + // TODO: add parseMemoryURI type + return createStringError(object_error::parse_failed, + "Memory Type URI is not currently supported."); + } + + StringRef getFileName() const { return FileName; } +}; + +/// Extracts fat binary in binary clang-offload-bundler format from object \p +/// Obj and return it in \p Bundles +Error extractOffloadBundleFatBinary( + const ObjectFile &Obj, SmallVectorImpl &Bundles); + +/// Extract code object memory from the given \p Source object file at \p 
Offset +/// and of \p Size, and copy into \p OutputFileName. +Error extractCodeObject(const ObjectFile &Source, int64_t Offset, int64_t Size, + StringRef OutputFileName); + +/// Extract code object memory from the given \p Source object file at \p Offset +/// and of \p Size, and copy into \p OutputFileName. +LLVM_ABI Error extractCodeObject(MemoryBufferRef Buffer, int64_t Offset, + int64_t Size, StringRef OutputFileName); +/// Extracts an Offload Bundle Entry given by URI +Error extractOffloadBundleByURI(StringRef URIstr); + +} // namespace object + +} // namespace llvm +#endif diff --git a/llvm/include/llvm/Support/Caching.h b/llvm/include/llvm/Support/Caching.h index cf45145619d95..120bcd9da02ed 100644 --- a/llvm/include/llvm/Support/Caching.h +++ b/llvm/include/llvm/Support/Caching.h @@ -30,6 +30,7 @@ class CachedFileStream { CachedFileStream(std::unique_ptr OS, std::string OSPath = "") : OS(std::move(OS)), ObjectPathName(OSPath) {} + virtual Error commit() { return Error::success(); } std::unique_ptr OS; std::string ObjectPathName; virtual ~CachedFileStream() = default; diff --git a/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h b/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h index 5ff38bdf04812..27195051ed7eb 100644 --- a/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h +++ b/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h @@ -40,6 +40,13 @@ class HipStdParAllocationInterpositionPass static bool isRequired() { return true; } }; +class HipStdParMathFixupPass : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); + + static bool isRequired() { return true; } +}; + } // namespace llvm #endif // LLVM_TRANSFORMS_HIPSTDPAR_HIPSTDPAR_H diff --git a/llvm/lib/Analysis/ScopedNoAliasAA.cpp b/llvm/lib/Analysis/ScopedNoAliasAA.cpp index 3815bdf49d59c..59e1179119160 100644 --- a/llvm/lib/Analysis/ScopedNoAliasAA.cpp +++ b/llvm/lib/Analysis/ScopedNoAliasAA.cpp @@ -114,6 +114,18 @@ static void collectMDInDomain(const 
MDNode *List, const MDNode *Domain, Nodes.insert(MD); } +/// Collect the set of scoped domains relevant to the noalias scopes. +void ScopedNoAliasAAResult::collectScopedDomains( + const MDNode *NoAlias, SmallPtrSetImpl &Domains) const { + if (!NoAlias) + return; + assert(Domains.empty() && "Domains should be empty"); + for (const MDOperand &MDOp : NoAlias->operands()) + if (const MDNode *NAMD = dyn_cast(MDOp)) + if (const MDNode *Domain = AliasScopeNode(NAMD).getDomain()) + Domains.insert(Domain); +} + bool ScopedNoAliasAAResult::mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias) const { if (!Scopes || !NoAlias) @@ -121,10 +133,7 @@ bool ScopedNoAliasAAResult::mayAliasInScopes(const MDNode *Scopes, // Collect the set of scope domains relevant to the noalias scopes. SmallPtrSet Domains; - for (const MDOperand &MDOp : NoAlias->operands()) - if (const MDNode *NAMD = dyn_cast(MDOp)) - if (const MDNode *Domain = AliasScopeNode(NAMD).getDomain()) - Domains.insert(Domain); + collectScopedDomains(NoAlias, Domains); // We alias unless, for some domain, the set of noalias scopes in that domain // is a superset of the set of alias scopes in that domain. diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index b148b8e3a8562..dcb035e3cad28 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -2018,7 +2018,7 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU, if (ProcessedLifetimes.insert(L).second) { if (auto *AddCU = dyn_cast(GV->getScope())) { AddCULifetimeMap[AddCU].push_back(L); - } else if (auto *AddNS = dyn_cast(GV->getScope())) { + } else if (isa(GV->getScope())) { // FIXME(KZHURAVL): Properly support DINamespace. 
} else if (auto *AddSP = dyn_cast(GV->getScope())) { SPLifetimeMap[AddSP].push_back(L); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 9afad8369e114..d656324c4e128 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -991,11 +991,9 @@ std::optional DwarfExpression::traverse(DIOp::Arg Arg, if (IsFragment) emitOp(dwarf::DW_OP_lit0); - unsigned RegSize = 0; for (auto &Reg : Regs) { if (Reg.SubRegSize % 8) return std::nullopt; - RegSize += Reg.SubRegSize; if (Reg.DwarfRegNo >= 0) addReg(Reg.DwarfRegNo, Reg.Comment); emitOp(dwarf::DW_OP_piece); diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index 365c1194aa4ca..53b12678d04d0 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -156,6 +156,15 @@ static cl::opt cl::desc("livedebugvalues-stack-ws-limit"), cl::init(250)); +// Limit for the maximum number of stack slot indexes. On targets where this is +// exceeded, this effectivly disables tracking debug locations across spills. +// The spill tracking in MLocTracker performs quite poorly in terms of memory +// and time on targets with a more complicated register file (FIXME). +static cl::opt + StackSlotIdxesLimit("livedebugvalues-max-stack-slot-idxes", cl::Hidden, + cl::desc("livedebugvalues-max-stack-slot-idxes"), + cl::init(128)); + DbgOpID DbgOpID::UndefID = DbgOpID(0xffffffff); /// Tracker for converting machine value locations and variable values into @@ -1122,6 +1131,10 @@ void MLocTracker::writeRegMask(const MachineOperand *MO, unsigned CurBB, } std::optional MLocTracker::getOrTrackSpillLoc(SpillLoc L) { + // Disable spill tracking on targets with a large number of slot idxes. 
+ if (NumSlotIdxes >= StackSlotIdxesLimit) + return std::nullopt; + SpillLocationNo SpillID(SpillLocs.idFor(L)); if (SpillID.id() == 0) { @@ -3721,6 +3734,15 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, VTracker = nullptr; TTracker = nullptr; + if (MTracker->NumSlotIdxes >= StackSlotIdxesLimit) { + LLVM_DEBUG( + dbgs() << "Disabling InstrRefBasedLDV spill tracking for " + << MF.getName() + << " since target has too many potential stack slot indexes (" + << MTracker->NumSlotIdxes << ", limit is " << StackSlotIdxesLimit + << ")\n"); + } + SmallVector MLocTransfer; SmallVector vlocs; LiveInsT SavedLiveIns; diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index d44b064dcb4b2..7cb2b9e730d51 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -1009,18 +1009,30 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { continue; } - LLVM_DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: "; - MaybeDead->dump()); - // Invalidate all entries in the copy map which are not preserved by // this register mask. - for (unsigned RegUnit : TRI->regunits(Reg)) + bool MIRefedinCopyInfo = false; + for (unsigned RegUnit : TRI->regunits(Reg)) { if (!PreservedRegUnits.test(RegUnit)) Tracker.clobberRegUnit(RegUnit, *TRI, *TII, UseCopyInstr); + else { + if (MaybeDead == Tracker.findCopyForUnit(RegUnit, *TRI)) { + MIRefedinCopyInfo = true; + } + } + } // erase() will return the next valid iterator pointing to the next // element after the erased one. 
DI = MaybeDeadCopies.erase(DI); + + // Preserved by RegMask, DO NOT remove copy + if (MIRefedinCopyInfo) + continue; + + LLVM_DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: " + << *MaybeDead); + MaybeDead->eraseFromParent(); Changed = true; ++NumDeletes; diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 836dbd5ecf500..14a6f43f7d5a0 100644 --- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -69,6 +69,14 @@ static cl::opt static cl::opt UseTBAA("use-tbaa-in-sched-mi", cl::Hidden, cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction")); +static cl::opt + EnableSchedModel("schedmodel", cl::Hidden, cl::init(true), + cl::desc("Use TargetSchedModel for latency lookup")); + +static cl::opt + EnableSchedItins("scheditins", cl::Hidden, cl::init(true), + cl::desc("Use InstrItineraryData for latency lookup")); + // Note: the two options below might be used in tuning compile time vs // output quality. Setting HugeRegion so large that it will never be // reached means best-effort, but may be slow. 
@@ -121,7 +129,7 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, DbgValues.clear(); const TargetSubtargetInfo &ST = mf.getSubtarget(); - SchedModel.init(&ST); + SchedModel.init(&ST, EnableSchedModel, EnableSchedItins); } /// If this machine instr has memory reference information and it can be diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 04bec16330f90..ac15319f658a1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -28265,7 +28265,8 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, if ((Fold || Swap) && TLI.getBooleanContents(CmpOpVT) == TargetLowering::ZeroOrOneBooleanContent && - (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) { + (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT)) && + TLI.convertSelectOfConstantsToMath(VT)) { if (Swap) { CC = ISD::getSetCCInverse(CC, CmpOpVT); diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index af2320871cf6c..63a047696db15 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1443,7 +1443,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { const DbgDefInst &DDI = *cast(II); const Value *Referrer = DDI.getReferrer(); assert(Referrer); - if (const auto *UV = dyn_cast(Referrer)) { + if (isa(Referrer)) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::DBG_DEF)) .addMetadata(DDI.getLifetime()) diff --git a/llvm/lib/CodeGen/TargetSchedule.cpp b/llvm/lib/CodeGen/TargetSchedule.cpp index db884b4940395..7ae9e0e37bbab 100644 --- a/llvm/lib/CodeGen/TargetSchedule.cpp +++ b/llvm/lib/CodeGen/TargetSchedule.cpp @@ -29,12 +29,6 @@ using namespace llvm; -static cl::opt EnableSchedModel("schedmodel", cl::Hidden, cl::init(true), - cl::desc("Use TargetSchedModel for latency lookup")); - -static 
cl::opt EnableSchedItins("scheditins", cl::Hidden, cl::init(true), - cl::desc("Use InstrItineraryData for latency lookup")); - static cl::opt ForceEnableIntervals( "sched-model-force-enable-intervals", cl::Hidden, cl::init(false), cl::desc("Force the use of resource intervals in the schedule model")); @@ -47,12 +41,16 @@ bool TargetSchedModel::hasInstrItineraries() const { return EnableSchedItins && !InstrItins.isEmpty(); } -void TargetSchedModel::init(const TargetSubtargetInfo *TSInfo) { +void TargetSchedModel::init(const TargetSubtargetInfo *TSInfo, + bool EnableSModel, bool EnableSItins) { STI = TSInfo; SchedModel = TSInfo->getSchedModel(); TII = TSInfo->getInstrInfo(); STI->initInstrItins(InstrItins); + EnableSchedModel = EnableSModel; + EnableSchedItins = EnableSItins; + unsigned NumRes = SchedModel.getNumProcResourceKinds(); ResourceFactors.resize(NumRes); ResourceLCM = SchedModel.IssueWidth; diff --git a/llvm/lib/Debuginfod/Debuginfod.cpp b/llvm/lib/Debuginfod/Debuginfod.cpp index 4c785117ae8ef..78dbfb1b1f15d 100644 --- a/llvm/lib/Debuginfod/Debuginfod.cpp +++ b/llvm/lib/Debuginfod/Debuginfod.cpp @@ -188,6 +188,7 @@ class StreamedHTTPResponseHandler : public HTTPResponseHandler { public: StreamedHTTPResponseHandler(CreateStreamFn CreateStream, HTTPClient &Client) : CreateStream(CreateStream), Client(Client) {} + Error commit(); virtual ~StreamedHTTPResponseHandler() = default; Error handleBodyChunk(StringRef BodyChunk) override; @@ -210,6 +211,12 @@ Error StreamedHTTPResponseHandler::handleBodyChunk(StringRef BodyChunk) { return Error::success(); } +Error StreamedHTTPResponseHandler::commit() { + if (FileStream) + return FileStream->commit(); + return Error::success(); +} + // An over-accepting simplification of the HTTP RFC 7230 spec. 
static bool isHeader(StringRef S) { StringRef Name; @@ -298,6 +305,8 @@ Expected getCachedOrDownloadArtifact( Error Err = Client.perform(Request, Handler); if (Err) return std::move(Err); + if ((Err = Handler.commit())) + return std::move(Err); unsigned Code = Client.responseCode(); if (Code && Code != 200) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 88be962dd187e..b98fc35f80488 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -313,7 +313,7 @@ static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) { } void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, - bool CreateBranch) { + bool CreateBranch, DebugLoc DL) { assert(New->getFirstInsertionPt() == New->begin() && "Target BB must not have PHI nodes"); @@ -321,15 +321,17 @@ void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, BasicBlock *Old = IP.getBlock(); New->splice(New->begin(), Old, IP.getPoint(), Old->end()); - if (CreateBranch) - BranchInst::Create(New, Old); + if (CreateBranch) { + auto *NewBr = BranchInst::Create(New, Old); + NewBr->setDebugLoc(DL); + } } void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) { DebugLoc DebugLoc = Builder.getCurrentDebugLocation(); BasicBlock *Old = Builder.GetInsertBlock(); - spliceBB(Builder.saveIP(), New, CreateBranch); + spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc); if (CreateBranch) Builder.SetInsertPoint(Old->getTerminator()); else @@ -341,12 +343,12 @@ void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) { } BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, - llvm::Twine Name) { + DebugLoc DL, llvm::Twine Name) { BasicBlock *Old = IP.getBlock(); BasicBlock *New = BasicBlock::Create( Old->getContext(), Name.isTriviallyEmpty() ? 
Old->getName() : Name, Old->getParent(), Old->getNextNode()); - spliceBB(IP, New, CreateBranch); + spliceBB(IP, New, CreateBranch, DL); New->replaceSuccessorsPhiUsesWith(Old, New); return New; } @@ -354,7 +356,7 @@ BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Name) { DebugLoc DebugLoc = Builder.getCurrentDebugLocation(); - BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name); + BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name); if (CreateBranch) Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator()); else @@ -368,7 +370,7 @@ BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch, BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch, llvm::Twine Name) { DebugLoc DebugLoc = Builder.getCurrentDebugLocation(); - BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name); + BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name); if (CreateBranch) Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator()); else @@ -4214,7 +4216,11 @@ Expected OpenMPIRBuilder::createCanonicalLoop( Value *IndVar = Builder.CreateAdd(Span, Start); return BodyGenCB(Builder.saveIP(), IndVar); }; - LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP(); + LocationDescription LoopLoc = + ComputeIP.isSet() + ? 
Loc + : LocationDescription(Builder.saveIP(), + Builder.getCurrentDebugLocation()); return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name); } @@ -5413,7 +5419,7 @@ void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop, Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock); InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()}; // Then block contains branch to omp loop which needs to be vectorized - spliceBB(IP, ThenBlock, false); + spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation()); ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock); Builder.SetInsertPoint(ElseBlock); diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 26c183378233d..dca42a57fa9e3 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -47,7 +47,7 @@ cl::opt PreserveInputDbgFormat( bool WriteNewDbgInfoFormatToBitcode /*set default value in cl::init() below*/; cl::opt WriteNewDbgInfoFormatToBitcode2( "write-experimental-debuginfo-iterators-to-bitcode", cl::Hidden, - cl::location(WriteNewDbgInfoFormatToBitcode), cl::init(false)); + cl::location(WriteNewDbgInfoFormatToBitcode), cl::init(true)); DbgMarker *BasicBlock::createMarker(Instruction *I) { assert(IsNewDbgInfoFormat && diff --git a/llvm/lib/IR/TypeFinder.cpp b/llvm/lib/IR/TypeFinder.cpp index 7429073e9e042..72a1120db7300 100644 --- a/llvm/lib/IR/TypeFinder.cpp +++ b/llvm/lib/IR/TypeFinder.cpp @@ -99,7 +99,11 @@ void TypeFinder::run(const Module &M, bool onlyNamed) { if (DVI->isDbgAssign()) { if (Value *Addr = DVI->getAddress()) incorporateValue(Addr); + if (auto *Expr = DVI->getRawAddressExpression()) + incorporateMDNode(Expr); } + if (auto *Expr = DVI->getRawExpression()) + incorporateMDNode(Expr); } } } diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 3c150b1487b75..37852bdc23265 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -477,6 +477,9 @@ static void codegen(const 
CodegenConfig &Conf, TargetMachine *TM, if (DwoOut) DwoOut->keep(); + + if (Error Err = Stream->commit()) + report_fatal_error(std::move(Err)); } static void splitCodeGen(const CodegenConfig &CodegenC, TargetMachine *TM, diff --git a/llvm/lib/Object/CMakeLists.txt b/llvm/lib/Object/CMakeLists.txt index bfb420e57a7f4..870169a83174f 100644 --- a/llvm/lib/Object/CMakeLists.txt +++ b/llvm/lib/Object/CMakeLists.txt @@ -22,6 +22,7 @@ add_llvm_component_library(LLVMObject Object.cpp ObjectFile.cpp OffloadBinary.cpp + OffloadBundle.cpp RecordStreamer.cpp RelocationResolver.cpp SymbolicFile.cpp diff --git a/llvm/lib/Object/OffloadBinary.cpp b/llvm/lib/Object/OffloadBinary.cpp index f3d2480eaca3c..b529ea93d393b 100644 --- a/llvm/lib/Object/OffloadBinary.cpp +++ b/llvm/lib/Object/OffloadBinary.cpp @@ -103,48 +103,6 @@ Error extractFromObject(const ObjectFile &Obj, return Error::success(); } -// Extract an Offload bundle (usually a Offload Bundle) from a fat_bin -// section -Error extractOffloadBundle(MemoryBufferRef Contents, uint64_t SectionOffset, - StringRef fileName, - SmallVectorImpl &Bundles) { - - uint64_t Offset = 0; - int64_t nextbundleStart = 0; - - // There could be multiple offloading bundles stored at this section. - while (nextbundleStart >= 0) { - - std::unique_ptr Buffer = - MemoryBuffer::getMemBuffer(Contents.getBuffer().drop_front(Offset), "", - /*RequiresNullTerminator*/ false); - - // Create the FatBinBindle object. This will also create the Bundle Entry - // list info. - auto FatBundleOrErr = - OffloadBundleFatBin::create(*Buffer, SectionOffset + Offset, fileName); - if (!FatBundleOrErr) - return FatBundleOrErr.takeError(); - OffloadBundleFatBin &Bundle = **FatBundleOrErr; - - // add current Bundle to list. 
- Bundles.emplace_back(std::move(**FatBundleOrErr)); - - // find the next bundle by searching for the magic string - StringRef str = Buffer->getBuffer(); - nextbundleStart = - (int64_t)str.find(StringRef("__CLANG_OFFLOAD_BUNDLE__"), 24); - - if (nextbundleStart >= 0) - Offset += nextbundleStart; - else { - return Error::success(); - } - } // end of while loop - - return Error::success(); -} - Error extractFromBitcode(MemoryBufferRef Buffer, SmallVectorImpl &Binaries) { LLVMContext Context; @@ -216,102 +174,6 @@ Error extractFromArchive(const Archive &Library, } // namespace -Error OffloadBundleFatBin::ReadEntries(StringRef Buffer, - uint64_t SectionOffset) { - uint64_t BundleNumber = 0; - uint64_t NumOfEntries = 0; - - // get Reader - BinaryStreamReader Reader(Buffer, llvm::endianness::little); - - // Read the Magic String first. - StringRef Magic; - if (auto EC = Reader.readFixedString(Magic, 24)) { - return errorCodeToError(object_error::parse_failed); - } - - // read the number of Code Objects (Entries) in the current Bundle. - if (auto EC = Reader.readInteger(NumOfEntries)) { - printf("OffloadBundleFatBin::ReadEntries .... 
failed to read number of " - "Entries\n"); - return errorCodeToError(object_error::parse_failed); - } - NumberOfEntries = NumOfEntries; - - // For each Bundle Entry (code object) - for (uint64_t I = 0; I < NumOfEntries; I++) { - uint64_t EntrySize; - uint64_t EntryOffset; - uint64_t EntryIDSize; - StringRef EntryID; - uint64_t absOffset; - - if (auto EC = Reader.readInteger(EntryOffset)) { - return errorCodeToError(object_error::parse_failed); - } - - if (auto EC = Reader.readInteger(EntrySize)) { - return errorCodeToError(object_error::parse_failed); - } - - if (auto EC = Reader.readInteger(EntryIDSize)) { - return errorCodeToError(object_error::parse_failed); - } - - if (auto EC = Reader.readFixedString(EntryID, EntryIDSize)) { - return errorCodeToError(object_error::parse_failed); - } - - // create a Bundle Entry object: - auto entry = new OffloadBundleEntry(EntryOffset + SectionOffset, EntrySize, - EntryIDSize, EntryID); - - Entries.push_back(*entry); - } // end of for loop - - return Error::success(); -} - -Expected> -OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset, - StringRef fileName) { - if (Buf.getBufferSize() < 24) - return errorCodeToError(object_error::parse_failed); - - // Check for magic bytes. - if (identify_magic(Buf.getBuffer()) != file_magic::offload_bundle) - return errorCodeToError(object_error::parse_failed); - - OffloadBundleFatBin *TheBundle = new OffloadBundleFatBin(Buf, fileName); - - // Read the Bundle Entries - Error Err = TheBundle->ReadEntries(Buf.getBuffer(), SectionOffset); - if (Err) - return errorCodeToError(object_error::parse_failed); - - return std::unique_ptr(TheBundle); -} - -Error OffloadBundleFatBin::extractBundle(const ObjectFile &Source) { - // This will extract all entries in the Bundle - SmallVectorImpl::iterator it = Entries.begin(); - for (int64_t I = 0; I < getNumEntries(); I++) { - - if (it->Size > 0) { - // create output file name. 
Which should be - // -offset-size.co" - std::string str = getFileName().str() + "-offset" + itostr(it->Offset) + - "-size" + itostr(it->Size) + ".co"; - if (Error Err = object::extractCodeObject(Source, it->Offset, it->Size, - StringRef(str))) - return Err; - } - ++it; - } - - return Error::success(); -} - Expected> OffloadBinary::create(MemoryBufferRef Buf) { if (Buf.getBufferSize() < sizeof(Header) + sizeof(Entry)) @@ -441,104 +303,6 @@ Error object::extractOffloadBinaries(MemoryBufferRef Buffer, } } -Error object::extractOffloadBundleFatBinary( - const ObjectFile &Obj, SmallVectorImpl &Bundles) { - assert((Obj.isELF() || Obj.isCOFF()) && "Invalid file type"); - - // iterate through Sections until we find an offload_bundle section. - for (SectionRef Sec : Obj.sections()) { - Expected Buffer = Sec.getContents(); - if (!Buffer) - return Buffer.takeError(); - - // If it does not start with the reserved suffix, just skip this section. - if ((llvm::identify_magic(*Buffer) == llvm::file_magic::offload_bundle) || - (llvm::identify_magic(*Buffer) == - llvm::file_magic::offload_bundle_compressed)) { - - uint64_t SectionOffset = 0; - if (Obj.isELF()) { - SectionOffset = ELFSectionRef(Sec).getOffset(); - } else if (Obj.isCOFF()) { - if (const COFFObjectFile *COFFObj = dyn_cast(&Obj)) { - const coff_section *CoffSection = COFFObj->getCOFFSection(Sec); - } - } - - MemoryBufferRef Contents(*Buffer, Obj.getFileName()); - - if (llvm::identify_magic(*Buffer) == - llvm::file_magic::offload_bundle_compressed) { - // Decompress the input if necessary. 
- Expected> DecompressedBufferOrErr = - CompressedOffloadBundle::decompress(Contents, false); - - if (!DecompressedBufferOrErr) - return createStringError( - inconvertibleErrorCode(), - "Failed to decompress input: " + - llvm::toString(DecompressedBufferOrErr.takeError())); - - MemoryBuffer &DecompressedInput = **DecompressedBufferOrErr; - if (Error Err = extractOffloadBundle(DecompressedInput, SectionOffset, - Obj.getFileName(), Bundles)) - return Err; - } else { - if (Error Err = extractOffloadBundle(Contents, SectionOffset, - Obj.getFileName(), Bundles)) - return Err; - } - } - } - return Error::success(); -} - -Error object::extractCodeObject(const ObjectFile &Source, int64_t Offset, - int64_t Size, StringRef OutputFileName) { - Expected> BufferOrErr = - FileOutputBuffer::create(OutputFileName, Size); - - if (!BufferOrErr) - return BufferOrErr.takeError(); - - Expected InputBuffOrErr = Source.getMemoryBufferRef(); - if (Error Err = InputBuffOrErr.takeError()) - return Err; - - std::unique_ptr Buf = std::move(*BufferOrErr); - std::copy(InputBuffOrErr->getBufferStart() + Offset, - InputBuffOrErr->getBufferStart() + Offset + Size, - Buf->getBufferStart()); - if (Error E = Buf->commit()) - return E; - - return Error::success(); -} - -// given a file name, offset, and size, extract data into a code object file, -// into file -offset-size.co -Error object::extractOffloadBundleByURI(StringRef URIstr) { - // create a URI object - object::OffloadBundleURI *uri = - new object::OffloadBundleURI(URIstr, FILE_URI); - - std::string OutputFile = uri->FileName.str(); - OutputFile += - "-offset" + itostr(uri->Offset) + "-size" + itostr(uri->Size) + ".co"; - - // Create an ObjectFile object from uri.file_uri - auto ObjOrErr = ObjectFile::createObjectFile(uri->FileName); - if (!ObjOrErr) - return ObjOrErr.takeError(); - - auto Obj = ObjOrErr->getBinary(); - if (Error Err = - object::extractCodeObject(*Obj, uri->Offset, uri->Size, OutputFile)) - return Err; - - return 
Error::success(); -} - OffloadKind object::getOffloadKind(StringRef Name) { return llvm::StringSwitch(Name) .Case("openmp", OFK_OpenMP) @@ -622,223 +386,3 @@ bool object::areTargetsCompatible(const OffloadFile::TargetID &LHS, return false; return true; } - -// Utility function to format numbers with commas -static std::string formatWithCommas(unsigned long long Value) { - std::string Num = std::to_string(Value); - int InsertPosition = Num.length() - 3; - while (InsertPosition > 0) { - Num.insert(InsertPosition, ","); - InsertPosition -= 3; - } - return Num; -} - -llvm::Expected> -CompressedOffloadBundle::decompress(llvm::MemoryBufferRef &Input, - - bool Verbose) { - StringRef Blob = Input.getBuffer(); - - if (Blob.size() < V1HeaderSize) - return llvm::MemoryBuffer::getMemBufferCopy(Blob); - - if (llvm::identify_magic(Blob) != - llvm::file_magic::offload_bundle_compressed) { - if (Verbose) - llvm::errs() << "Uncompressed bundle.\n"; - return llvm::MemoryBuffer::getMemBufferCopy(Blob); - } - - size_t CurrentOffset = MagicSize; - - uint16_t ThisVersion; - memcpy(&ThisVersion, Blob.data() + CurrentOffset, sizeof(uint16_t)); - CurrentOffset += VersionFieldSize; - - uint16_t CompressionMethod; - memcpy(&CompressionMethod, Blob.data() + CurrentOffset, sizeof(uint16_t)); - CurrentOffset += MethodFieldSize; - - uint32_t TotalFileSize; - if (ThisVersion >= 2) { - if (Blob.size() < V2HeaderSize) - return createStringError(inconvertibleErrorCode(), - "Compressed bundle header size too small"); - memcpy(&TotalFileSize, Blob.data() + CurrentOffset, sizeof(uint32_t)); - CurrentOffset += FileSizeFieldSize; - } - - uint32_t UncompressedSize; - memcpy(&UncompressedSize, Blob.data() + CurrentOffset, sizeof(uint32_t)); - CurrentOffset += UncompressedSizeFieldSize; - - uint64_t StoredHash; - memcpy(&StoredHash, Blob.data() + CurrentOffset, sizeof(uint64_t)); - CurrentOffset += HashFieldSize; - - llvm::compression::Format CompressionFormat; - if (CompressionMethod == - 
static_cast(llvm::compression::Format::Zlib)) - CompressionFormat = llvm::compression::Format::Zlib; - else if (CompressionMethod == - static_cast(llvm::compression::Format::Zstd)) - CompressionFormat = llvm::compression::Format::Zstd; - else - return createStringError(inconvertibleErrorCode(), - "Unknown compressing method"); - - llvm::Timer DecompressTimer("Decompression Timer", "Decompression time", - OffloadBundlerTimerGroup); - if (Verbose) - DecompressTimer.startTimer(); - - SmallVector DecompressedData; - StringRef CompressedData = Blob.substr(CurrentOffset); - if (llvm::Error DecompressionError = llvm::compression::decompress( - CompressionFormat, llvm::arrayRefFromStringRef(CompressedData), - DecompressedData, UncompressedSize)) - return createStringError(inconvertibleErrorCode(), - "Could not decompress embedded file contents: " + - llvm::toString(std::move(DecompressionError))); - - if (Verbose) { - DecompressTimer.stopTimer(); - - double DecompressionTimeSeconds = - DecompressTimer.getTotalTime().getWallTime(); - - // Recalculate MD5 hash for integrity check - llvm::Timer HashRecalcTimer("Hash Recalculation Timer", - "Hash recalculation time", - OffloadBundlerTimerGroup); - HashRecalcTimer.startTimer(); - llvm::MD5 Hash; - llvm::MD5::MD5Result Result; - Hash.update(llvm::ArrayRef(DecompressedData.data(), - DecompressedData.size())); - Hash.final(Result); - uint64_t RecalculatedHash = Result.low(); - HashRecalcTimer.stopTimer(); - bool HashMatch = (StoredHash == RecalculatedHash); - - double CompressionRate = - static_cast(UncompressedSize) / CompressedData.size(); - double DecompressionSpeedMBs = - (UncompressedSize / (1024.0 * 1024.0)) / DecompressionTimeSeconds; - - llvm::errs() << "Compressed bundle format version: " << ThisVersion << "\n"; - if (ThisVersion >= 2) - llvm::errs() << "Total file size (from header): " - << formatWithCommas(TotalFileSize) << " bytes\n"; - llvm::errs() << "Decompression method: " - << (CompressionFormat == 
llvm::compression::Format::Zlib - ? "zlib" - : "zstd") - << "\n" - << "Size before decompression: " - << formatWithCommas(CompressedData.size()) << " bytes\n" - << "Size after decompression: " - << formatWithCommas(UncompressedSize) << " bytes\n" - << "Compression rate: " - << llvm::format("%.2lf", CompressionRate) << "\n" - << "Compression ratio: " - << llvm::format("%.2lf%%", 100.0 / CompressionRate) << "\n" - << "Decompression speed: " - << llvm::format("%.2lf MB/s", DecompressionSpeedMBs) << "\n" - << "Stored hash: " << llvm::format_hex(StoredHash, 16) << "\n" - << "Recalculated hash: " - << llvm::format_hex(RecalculatedHash, 16) << "\n" - << "Hashes match: " << (HashMatch ? "Yes" : "No") << "\n"; - } - - return llvm::MemoryBuffer::getMemBufferCopy( - llvm::toStringRef(DecompressedData)); -} - -llvm::Expected> -CompressedOffloadBundle::compress(llvm::compression::Params P, - const llvm::MemoryBuffer &Input, - bool Verbose) { - if (!llvm::compression::zstd::isAvailable() && - !llvm::compression::zlib::isAvailable()) - return createStringError(llvm::inconvertibleErrorCode(), - "Compression not supported"); - - llvm::Timer HashTimer("Hash Calculation Timer", "Hash calculation time", - OffloadBundlerTimerGroup); - if (Verbose) - HashTimer.startTimer(); - llvm::MD5 Hash; - llvm::MD5::MD5Result Result; - Hash.update(Input.getBuffer()); - Hash.final(Result); - uint64_t TruncatedHash = Result.low(); - if (Verbose) - HashTimer.stopTimer(); - - SmallVector CompressedBuffer; - auto BufferUint8 = llvm::ArrayRef( - reinterpret_cast(Input.getBuffer().data()), - Input.getBuffer().size()); - - llvm::Timer CompressTimer("Compression Timer", "Compression time", - OffloadBundlerTimerGroup); - if (Verbose) - CompressTimer.startTimer(); - llvm::compression::compress(P, BufferUint8, CompressedBuffer); - if (Verbose) - CompressTimer.stopTimer(); - - uint16_t CompressionMethod = static_cast(P.format); - uint32_t UncompressedSize = Input.getBuffer().size(); - uint32_t TotalFileSize = 
MagicNumber.size() + sizeof(TotalFileSize) + - sizeof(Version) + sizeof(CompressionMethod) + - sizeof(UncompressedSize) + sizeof(TruncatedHash) + - CompressedBuffer.size(); - - SmallVector FinalBuffer; - llvm::raw_svector_ostream OS(FinalBuffer); - OS << MagicNumber; - OS.write(reinterpret_cast(&Version), sizeof(Version)); - OS.write(reinterpret_cast(&CompressionMethod), - sizeof(CompressionMethod)); - OS.write(reinterpret_cast(&TotalFileSize), - sizeof(TotalFileSize)); - OS.write(reinterpret_cast(&UncompressedSize), - sizeof(UncompressedSize)); - OS.write(reinterpret_cast(&TruncatedHash), - sizeof(TruncatedHash)); - OS.write(reinterpret_cast(CompressedBuffer.data()), - CompressedBuffer.size()); - - if (Verbose) { - auto MethodUsed = - P.format == llvm::compression::Format::Zstd ? "zstd" : "zlib"; - double CompressionRate = - static_cast(UncompressedSize) / CompressedBuffer.size(); - double CompressionTimeSeconds = CompressTimer.getTotalTime().getWallTime(); - double CompressionSpeedMBs = - (UncompressedSize / (1024.0 * 1024.0)) / CompressionTimeSeconds; - - llvm::errs() << "Compressed bundle format version: " << Version << "\n" - << "Total file size (including headers): " - << formatWithCommas(TotalFileSize) << " bytes\n" - << "Compression method used: " << MethodUsed << "\n" - << "Compression level: " << P.level << "\n" - << "Binary size before compression: " - << formatWithCommas(UncompressedSize) << " bytes\n" - << "Binary size after compression: " - << formatWithCommas(CompressedBuffer.size()) << " bytes\n" - << "Compression rate: " - << llvm::format("%.2lf", CompressionRate) << "\n" - << "Compression ratio: " - << llvm::format("%.2lf%%", 100.0 / CompressionRate) << "\n" - << "Compression speed: " - << llvm::format("%.2lf MB/s", CompressionSpeedMBs) << "\n" - << "Truncated MD5 hash: " - << llvm::format_hex(TruncatedHash, 16) << "\n"; - } - return llvm::MemoryBuffer::getMemBufferCopy( - llvm::StringRef(FinalBuffer.data(), FinalBuffer.size())); -} diff --git 
a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp new file mode 100644 index 0000000000000..4ee236a1bb469 --- /dev/null +++ b/llvm/lib/Object/OffloadBundle.cpp @@ -0,0 +1,623 @@ +//===- OffloadBundle.cpp - Utilities for offload bundles---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------===// + +#include "llvm/Object/OffloadBundle.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/BinaryFormat/Magic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Module.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/MC/StringTableBuilder.h" +#include "llvm/Object/Archive.h" +#include "llvm/Object/Binary.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/Error.h" +#include "llvm/Object/IRObjectFile.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/Timer.h" + +using namespace llvm; +using namespace llvm::object; + +static llvm::TimerGroup + OffloadBundlerTimerGroup("Offload Bundler Timer Group", + "Timer group for offload bundler"); + +// Extract an Offload bundle (usually a Offload Bundle) from a fat_bin +// section +Error extractOffloadBundle(MemoryBufferRef Contents, uint64_t SectionOffset, + StringRef FileName, + SmallVectorImpl &Bundles) { + + size_t Offset = 0; + size_t NextbundleStart = 0; + StringRef Magic; + std::unique_ptr Buffer; + + // There could be multiple offloading bundles stored at this section. 
+ while ((NextbundleStart != StringRef::npos) && + (Offset < Contents.getBuffer().size())) { + Buffer = + MemoryBuffer::getMemBuffer(Contents.getBuffer().drop_front(Offset), "", + /*RequiresNullTerminator=*/false); + + if (identify_magic((*Buffer).getBuffer()) == + file_magic::offload_bundle_compressed) { + Magic = StringRef("CCOB"); + // decompress this bundle first. + NextbundleStart = (*Buffer).getBuffer().find(Magic, Magic.size()); + if (NextbundleStart == StringRef::npos) { + NextbundleStart = (*Buffer).getBuffer().size(); + } + + ErrorOr> CodeOrErr = + MemoryBuffer::getMemBuffer((*Buffer).getBuffer().take_front( + NextbundleStart /*- Magic.size()*/), + FileName, false); + if (std::error_code EC = CodeOrErr.getError()) + return createFileError(FileName, EC); + + Expected> DecompressedBufferOrErr = + CompressedOffloadBundle::decompress(**CodeOrErr, false); + if (!DecompressedBufferOrErr) + return createStringError( + inconvertibleErrorCode(), + "Failed to decompress input: " + + llvm::toString(DecompressedBufferOrErr.takeError())); + + auto FatBundleOrErr = OffloadBundleFatBin::create( + **DecompressedBufferOrErr, Offset, FileName, true); + if (!FatBundleOrErr) + return FatBundleOrErr.takeError(); + + // Add current Bundle to list. + Bundles.emplace_back(std::move(**FatBundleOrErr)); + + } else if (identify_magic((*Buffer).getBuffer()) == + file_magic::offload_bundle) { + // Create the FatBinBindle object. This will also create the Bundle Entry + // list info. + auto FatBundleOrErr = OffloadBundleFatBin::create( + *Buffer, SectionOffset + Offset, FileName); + if (!FatBundleOrErr) + return FatBundleOrErr.takeError(); + + // Add current Bundle to list. 
+ Bundles.emplace_back(std::move(**FatBundleOrErr)); + + Magic = StringRef("__CLANG_OFFLOAD_BUNDLE__"); + NextbundleStart = (*Buffer).getBuffer().find(Magic, Magic.size()); + } + + if (NextbundleStart != StringRef::npos) + Offset += NextbundleStart; + } + + return Error::success(); +} + +Error OffloadBundleFatBin::readEntries(StringRef Buffer, + uint64_t SectionOffset) { + uint64_t NumOfEntries = 0; + + BinaryStreamReader Reader(Buffer, llvm::endianness::little); + + // Read the Magic String first. + StringRef Magic; + if (auto EC = Reader.readFixedString(Magic, 24)) + return errorCodeToError(object_error::parse_failed); + + // Read the number of Code Objects (Entries) in the current Bundle. + if (auto EC = Reader.readInteger(NumOfEntries)) + return errorCodeToError(object_error::parse_failed); + + NumberOfEntries = NumOfEntries; + + // For each Bundle Entry (code object) + for (uint64_t I = 0; I < NumOfEntries; I++) { + uint64_t EntrySize; + uint64_t EntryOffset; + uint64_t EntryIDSize; + StringRef EntryID; + + if (auto EC = Reader.readInteger(EntryOffset)) + return errorCodeToError(object_error::parse_failed); + + if (auto EC = Reader.readInteger(EntrySize)) + return errorCodeToError(object_error::parse_failed); + + if (auto EC = Reader.readInteger(EntryIDSize)) + return errorCodeToError(object_error::parse_failed); + + if (auto EC = Reader.readFixedString(EntryID, EntryIDSize)) + return errorCodeToError(object_error::parse_failed); + + auto Entry = std::make_unique( + EntryOffset + SectionOffset, EntrySize, EntryIDSize, EntryID.str()); + + Entries.push_back(*Entry); + } + + return Error::success(); +} + +Expected> +OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset, + StringRef FileName, bool Decompress) { + if (Buf.getBufferSize() < 24) + return errorCodeToError(object_error::parse_failed); + + // Check for magic bytes. 
+ if ((identify_magic(Buf.getBuffer()) != file_magic::offload_bundle) && + (identify_magic(Buf.getBuffer()) != + file_magic::offload_bundle_compressed)) + return errorCodeToError(object_error::parse_failed); + + OffloadBundleFatBin *TheBundle = + new OffloadBundleFatBin(Buf, FileName, Decompress); + + // Read the Bundle Entries + Error Err = + TheBundle->readEntries(Buf.getBuffer(), Decompress ? 0 : SectionOffset); + if (Err) + return errorCodeToError(object_error::parse_failed); + + return std::unique_ptr(TheBundle); +} + +Error OffloadBundleFatBin::extractBundle(const ObjectFile &Source) { + // This will extract all entries in the Bundle + for (OffloadBundleEntry &Entry : Entries) { + + if (Entry.Size == 0) + continue; + + // create output file name. Which should be + // -offset-size.co" + std::string Str = getFileName().str() + "-offset" + itostr(Entry.Offset) + + "-size" + itostr(Entry.Size) + ".co"; + if (Error Err = object::extractCodeObject(Source, Entry.Offset, Entry.Size, + StringRef(Str))) + return Err; + } + + return Error::success(); +} + +Error object::extractOffloadBundleFatBinary( + const ObjectFile &Obj, SmallVectorImpl &Bundles) { + assert((Obj.isELF() || Obj.isCOFF()) && "Invalid file type"); + + // Iterate through Sections until we find an offload_bundle section. + for (SectionRef Sec : Obj.sections()) { + Expected Buffer = Sec.getContents(); + if (!Buffer) + return Buffer.takeError(); + + // If it does not start with the reserved suffix, just skip this section. 
+ if ((llvm::identify_magic(*Buffer) == llvm::file_magic::offload_bundle) || + (llvm::identify_magic(*Buffer) == + llvm::file_magic::offload_bundle_compressed)) { + + uint64_t SectionOffset = 0; + if (Obj.isELF()) { + SectionOffset = ELFSectionRef(Sec).getOffset(); + } else if (Obj.isCOFF()) // TODO: add COFF Support + return createStringError(object_error::parse_failed, + "COFF object files not supported.\n"); + + MemoryBufferRef Contents(*Buffer, Obj.getFileName()); + if (Error Err = extractOffloadBundle(Contents, SectionOffset, + Obj.getFileName(), Bundles)) + return Err; + } + } + return Error::success(); +} + +Error object::extractCodeObject(const ObjectFile &Source, int64_t Offset, + int64_t Size, StringRef OutputFileName) { + Expected> BufferOrErr = + FileOutputBuffer::create(OutputFileName, Size); + + if (!BufferOrErr) + return BufferOrErr.takeError(); + + Expected InputBuffOrErr = Source.getMemoryBufferRef(); + if (Error Err = InputBuffOrErr.takeError()) + return Err; + + std::unique_ptr Buf = std::move(*BufferOrErr); + std::copy(InputBuffOrErr->getBufferStart() + Offset, + InputBuffOrErr->getBufferStart() + Offset + Size, + Buf->getBufferStart()); + if (Error E = Buf->commit()) + return E; + + return Error::success(); +} + +Error object::extractCodeObject(const MemoryBufferRef Buffer, int64_t Offset, + int64_t Size, StringRef OutputFileName) { + Expected> BufferOrErr = + FileOutputBuffer::create(OutputFileName, Size); + if (!BufferOrErr) + return BufferOrErr.takeError(); + + std::unique_ptr Buf = std::move(*BufferOrErr); + std::copy(Buffer.getBufferStart() + Offset, + Buffer.getBufferStart() + Offset + Size, Buf->getBufferStart()); + if (Error E = Buf->commit()) + return E; + + return Error::success(); +} + +// given a file name, offset, and size, extract data into a code object file, +// into file -offset-size.co +Error object::extractOffloadBundleByURI(StringRef URIstr) { + // create a URI object + Expected> UriOrErr( + 
OffloadBundleURI::createOffloadBundleURI(URIstr, FILE_URI)); + if (!UriOrErr) + return UriOrErr.takeError(); + + OffloadBundleURI &Uri = **UriOrErr; + std::string OutputFile = Uri.FileName.str(); + OutputFile += + "-offset" + itostr(Uri.Offset) + "-size" + itostr(Uri.Size) + ".co"; + + // Create an ObjectFile object from uri.file_uri + auto ObjOrErr = ObjectFile::createObjectFile(Uri.FileName); + if (!ObjOrErr) + return ObjOrErr.takeError(); + + auto Obj = ObjOrErr->getBinary(); + if (Error Err = + object::extractCodeObject(*Obj, Uri.Offset, Uri.Size, OutputFile)) + return Err; + + return Error::success(); +} + +// Utility function to format numbers with commas +static std::string formatWithCommas(unsigned long long Value) { + std::string Num = std::to_string(Value); + int InsertPosition = Num.length() - 3; + while (InsertPosition > 0) { + Num.insert(InsertPosition, ","); + InsertPosition -= 3; + } + return Num; +} + +llvm::Expected> +CompressedOffloadBundle::compress(llvm::compression::Params P, + const llvm::MemoryBuffer &Input, + uint16_t Version, bool Verbose) { + if (!llvm::compression::zstd::isAvailable() && + !llvm::compression::zlib::isAvailable()) + return createStringError(llvm::inconvertibleErrorCode(), + "Compression not supported"); + llvm::Timer HashTimer("Hash Calculation Timer", "Hash calculation time", + OffloadBundlerTimerGroup); + if (Verbose) + HashTimer.startTimer(); + llvm::MD5 Hash; + llvm::MD5::MD5Result Result; + Hash.update(Input.getBuffer()); + Hash.final(Result); + uint64_t TruncatedHash = Result.low(); + if (Verbose) + HashTimer.stopTimer(); + + SmallVector CompressedBuffer; + auto BufferUint8 = llvm::ArrayRef( + reinterpret_cast(Input.getBuffer().data()), + Input.getBuffer().size()); + llvm::Timer CompressTimer("Compression Timer", "Compression time", + OffloadBundlerTimerGroup); + if (Verbose) + CompressTimer.startTimer(); + llvm::compression::compress(P, BufferUint8, CompressedBuffer); + if (Verbose) + CompressTimer.stopTimer(); + + 
uint16_t CompressionMethod = static_cast(P.format); + + // Store sizes in 64-bit variables first + uint64_t UncompressedSize64 = Input.getBuffer().size(); + uint64_t TotalFileSize64; + + // Calculate total file size based on version + if (Version == 2) { + // For V2, ensure the sizes don't exceed 32-bit limit + if (UncompressedSize64 > std::numeric_limits::max()) + return createStringError(llvm::inconvertibleErrorCode(), + "Uncompressed size exceeds version 2 limit"); + if ((MagicNumber.size() + sizeof(uint32_t) + sizeof(Version) + + sizeof(CompressionMethod) + sizeof(uint32_t) + sizeof(TruncatedHash) + + CompressedBuffer.size()) > std::numeric_limits::max()) + return createStringError(llvm::inconvertibleErrorCode(), + "Total file size exceeds version 2 limit"); + + TotalFileSize64 = MagicNumber.size() + sizeof(uint32_t) + sizeof(Version) + + sizeof(CompressionMethod) + sizeof(uint32_t) + + sizeof(TruncatedHash) + CompressedBuffer.size(); + } else { // Version 3 + TotalFileSize64 = MagicNumber.size() + sizeof(uint64_t) + sizeof(Version) + + sizeof(CompressionMethod) + sizeof(uint64_t) + + sizeof(TruncatedHash) + CompressedBuffer.size(); + } + + SmallVector FinalBuffer; + llvm::raw_svector_ostream OS(FinalBuffer); + OS << MagicNumber; + OS.write(reinterpret_cast(&Version), sizeof(Version)); + OS.write(reinterpret_cast(&CompressionMethod), + sizeof(CompressionMethod)); + + // Write size fields according to version + if (Version == 2) { + uint32_t TotalFileSize32 = static_cast(TotalFileSize64); + uint32_t UncompressedSize32 = static_cast(UncompressedSize64); + OS.write(reinterpret_cast(&TotalFileSize32), + sizeof(TotalFileSize32)); + OS.write(reinterpret_cast(&UncompressedSize32), + sizeof(UncompressedSize32)); + } else { // Version 3 + OS.write(reinterpret_cast(&TotalFileSize64), + sizeof(TotalFileSize64)); + OS.write(reinterpret_cast(&UncompressedSize64), + sizeof(UncompressedSize64)); + } + + OS.write(reinterpret_cast(&TruncatedHash), + sizeof(TruncatedHash)); + 
OS.write(reinterpret_cast(CompressedBuffer.data()), + CompressedBuffer.size()); + + if (Verbose) { + auto MethodUsed = + P.format == llvm::compression::Format::Zstd ? "zstd" : "zlib"; + double CompressionRate = + static_cast(UncompressedSize64) / CompressedBuffer.size(); + double CompressionTimeSeconds = CompressTimer.getTotalTime().getWallTime(); + double CompressionSpeedMBs = + (UncompressedSize64 / (1024.0 * 1024.0)) / CompressionTimeSeconds; + llvm::errs() << "Compressed bundle format version: " << Version << "\n" + << "Total file size (including headers): " + << formatWithCommas(TotalFileSize64) << " bytes\n" + << "Compression method used: " << MethodUsed << "\n" + << "Compression level: " << P.level << "\n" + << "Binary size before compression: " + << formatWithCommas(UncompressedSize64) << " bytes\n" + << "Binary size after compression: " + << formatWithCommas(CompressedBuffer.size()) << " bytes\n" + << "Compression rate: " + << llvm::format("%.2lf", CompressionRate) << "\n" + << "Compression ratio: " + << llvm::format("%.2lf%%", 100.0 / CompressionRate) << "\n" + << "Compression speed: " + << llvm::format("%.2lf MB/s", CompressionSpeedMBs) << "\n" + << "Truncated MD5 hash: " + << llvm::format_hex(TruncatedHash, 16) << "\n"; + } + + return llvm::MemoryBuffer::getMemBufferCopy( + llvm::StringRef(FinalBuffer.data(), FinalBuffer.size())); +} + +// Use packed structs to avoid padding, such that the structs map the serialized +// format. 
+LLVM_PACKED_START +union RawCompressedBundleHeader { + struct CommonFields { + uint32_t Magic; + uint16_t Version; + uint16_t Method; + }; + + struct V1Header { + CommonFields Common; + uint32_t UncompressedFileSize; + uint64_t Hash; + }; + + struct V2Header { + CommonFields Common; + uint32_t FileSize; + uint32_t UncompressedFileSize; + uint64_t Hash; + }; + + struct V3Header { + CommonFields Common; + uint64_t FileSize; + uint64_t UncompressedFileSize; + uint64_t Hash; + }; + + CommonFields Common; + V1Header V1; + V2Header V2; + V3Header V3; +}; +LLVM_PACKED_END + +// Helper method to get header size based on version +static size_t getHeaderSize(uint16_t Version) { + switch (Version) { + case 1: + return sizeof(RawCompressedBundleHeader::V1Header); + case 2: + return sizeof(RawCompressedBundleHeader::V2Header); + case 3: + return sizeof(RawCompressedBundleHeader::V3Header); + default: + llvm_unreachable("Unsupported version"); + } +} + +Expected +CompressedOffloadBundle::CompressedBundleHeader::tryParse(StringRef Blob) { + assert(Blob.size() >= sizeof(RawCompressedBundleHeader::CommonFields)); + assert(llvm::identify_magic(Blob) == + llvm::file_magic::offload_bundle_compressed); + + RawCompressedBundleHeader Header; + memcpy(&Header, Blob.data(), std::min(Blob.size(), sizeof(Header))); + + CompressedBundleHeader Normalized; + Normalized.Version = Header.Common.Version; + + size_t RequiredSize = getHeaderSize(Normalized.Version); + + if (Blob.size() < RequiredSize) + return createStringError(inconvertibleErrorCode(), + "Compressed bundle header size too small"); + + switch (Normalized.Version) { + case 1: + Normalized.UncompressedFileSize = Header.V1.UncompressedFileSize; + Normalized.Hash = Header.V1.Hash; + break; + case 2: + Normalized.FileSize = Header.V2.FileSize; + Normalized.UncompressedFileSize = Header.V2.UncompressedFileSize; + Normalized.Hash = Header.V2.Hash; + break; + case 3: + Normalized.FileSize = Header.V3.FileSize; + 
Normalized.UncompressedFileSize = Header.V3.UncompressedFileSize; + Normalized.Hash = Header.V3.Hash; + break; + default: + return createStringError(inconvertibleErrorCode(), + "Unknown compressed bundle version"); + } + + // Determine compression format + switch (Header.Common.Method) { + case static_cast(compression::Format::Zlib): + case static_cast(compression::Format::Zstd): + Normalized.CompressionFormat = + static_cast(Header.Common.Method); + break; + default: + return createStringError(inconvertibleErrorCode(), + "Unknown compressing method"); + } + + return Normalized; +} + +llvm::Expected> +CompressedOffloadBundle::decompress(const llvm::MemoryBuffer &Input, + bool Verbose) { + StringRef Blob = Input.getBuffer(); + + // Check minimum header size (using V1 as it's the smallest) + if (Blob.size() < sizeof(RawCompressedBundleHeader::CommonFields)) + return llvm::MemoryBuffer::getMemBufferCopy(Blob); + + if (llvm::identify_magic(Blob) != + llvm::file_magic::offload_bundle_compressed) { + if (Verbose) + llvm::errs() << "Uncompressed bundle.\n"; + return llvm::MemoryBuffer::getMemBufferCopy(Blob); + } + + Expected HeaderOrErr = + CompressedBundleHeader::tryParse(Blob); + if (!HeaderOrErr) + return HeaderOrErr.takeError(); + + const CompressedBundleHeader &Normalized = *HeaderOrErr; + unsigned ThisVersion = Normalized.Version; + size_t HeaderSize = getHeaderSize(ThisVersion); + + llvm::compression::Format CompressionFormat = Normalized.CompressionFormat; + + size_t TotalFileSize = Normalized.FileSize.value_or(0); + size_t UncompressedSize = Normalized.UncompressedFileSize; + auto StoredHash = Normalized.Hash; + + llvm::Timer DecompressTimer("Decompression Timer", "Decompression time", + OffloadBundlerTimerGroup); + if (Verbose) + DecompressTimer.startTimer(); + + SmallVector DecompressedData; + StringRef CompressedData = + Blob.substr(HeaderSize, TotalFileSize - HeaderSize); + + if (llvm::Error DecompressionError = llvm::compression::decompress( + 
CompressionFormat, llvm::arrayRefFromStringRef(CompressedData), + DecompressedData, UncompressedSize)) + return createStringError(inconvertibleErrorCode(), + "Could not decompress embedded file contents: " + + llvm::toString(std::move(DecompressionError))); + + if (Verbose) { + DecompressTimer.stopTimer(); + + double DecompressionTimeSeconds = + DecompressTimer.getTotalTime().getWallTime(); + + // Recalculate MD5 hash for integrity check + llvm::Timer HashRecalcTimer("Hash Recalculation Timer", + "Hash recalculation time", + OffloadBundlerTimerGroup); + HashRecalcTimer.startTimer(); + llvm::MD5 Hash; + llvm::MD5::MD5Result Result; + Hash.update(llvm::ArrayRef(DecompressedData.data(), + DecompressedData.size())); + Hash.final(Result); + uint64_t RecalculatedHash = Result.low(); + HashRecalcTimer.stopTimer(); + bool HashMatch = (StoredHash == RecalculatedHash); + + double CompressionRate = + static_cast(UncompressedSize) / CompressedData.size(); + double DecompressionSpeedMBs = + (UncompressedSize / (1024.0 * 1024.0)) / DecompressionTimeSeconds; + + llvm::errs() << "Compressed bundle format version: " << ThisVersion << "\n"; + if (ThisVersion >= 2) + llvm::errs() << "Total file size (from header): " + << formatWithCommas(TotalFileSize) << " bytes\n"; + llvm::errs() << "Decompression method: " + << (CompressionFormat == llvm::compression::Format::Zlib + ? 
"zlib" + : "zstd") + << "\n" + << "Size before decompression: " + << formatWithCommas(CompressedData.size()) << " bytes\n" + << "Size after decompression: " + << formatWithCommas(UncompressedSize) << " bytes\n" + << "Compression rate: " + << llvm::format("%.2lf", CompressionRate) << "\n" + << "Compression ratio: " + << llvm::format("%.2lf%%", 100.0 / CompressionRate) << "\n" + << "Decompression speed: " + << llvm::format("%.2lf MB/s", DecompressionSpeedMBs) << "\n" + << "Stored hash: " << llvm::format_hex(StoredHash, 16) << "\n" + << "Recalculated hash: " + << llvm::format_hex(RecalculatedHash, 16) << "\n" + << "Hashes match: " << (HashMatch ? "Yes" : "No") << "\n"; + } + + return llvm::MemoryBuffer::getMemBufferCopy( + llvm::toStringRef(DecompressedData)); +} diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 147f5a1cf63ab..05f36111ce7ea 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -77,6 +77,7 @@ MODULE_PASS("globalopt", GlobalOptPass()) MODULE_PASS("globalsplit", GlobalSplitPass()) MODULE_PASS("heterogeneous-debug-verify", HeterogeneousDebugVerify(TM->getOptLevel())) MODULE_PASS("hipstdpar-interpose-alloc", HipStdParAllocationInterpositionPass()) +MODULE_PASS("hipstdpar-math-fixup", HipStdParMathFixupPass()) MODULE_PASS("hipstdpar-select-accelerator-code", HipStdParAcceleratorCodeSelectionPass()) MODULE_PASS("hotcoldsplit", HotColdSplittingPass()) diff --git a/llvm/lib/Support/Caching.cpp b/llvm/lib/Support/Caching.cpp index 66e540efaca97..2ecdf53701030 100644 --- a/llvm/lib/Support/Caching.cpp +++ b/llvm/lib/Support/Caching.cpp @@ -80,6 +80,7 @@ Expected llvm::localCache(const Twine &CacheNameRef, sys::fs::TempFile TempFile; std::string ModuleName; unsigned Task; + bool Committed = false; CacheStream(std::unique_ptr OS, AddBufferFn AddBuffer, sys::fs::TempFile TempFile, std::string EntryPath, @@ -88,9 +89,10 @@ Expected llvm::localCache(const Twine &CacheNameRef, 
AddBuffer(std::move(AddBuffer)), TempFile(std::move(TempFile)), ModuleName(ModuleName), Task(Task) {} - ~CacheStream() { - // TODO: Manually commit rather than using non-trivial destructor, - // allowing to replace report_fatal_errors with a return Error. + Error commit() override { + if (Committed) + return Error::success(); + Committed = true; // Make sure the stream is closed before committing it. OS.reset(); @@ -100,10 +102,12 @@ Expected llvm::localCache(const Twine &CacheNameRef, MemoryBuffer::getOpenFile( sys::fs::convertFDToNativeFile(TempFile.FD), ObjectPathName, /*FileSize=*/-1, /*RequiresNullTerminator=*/false); - if (!MBOrErr) - report_fatal_error(Twine("Failed to open new cache file ") + - TempFile.TmpName + ": " + - MBOrErr.getError().message() + "\n"); + if (!MBOrErr) { + std::error_code EC = MBOrErr.getError(); + return createStringError(EC, Twine("Failed to open new cache file ") + + TempFile.TmpName + ": " + + EC.message() + "\n"); + } // On POSIX systems, this will atomically replace the destination if // it already exists. 
We try to emulate this on Windows, but this may @@ -118,7 +122,10 @@ Expected llvm::localCache(const Twine &CacheNameRef, E = handleErrors(std::move(E), [&](const ECError &E) -> Error { std::error_code EC = E.convertToErrorCode(); if (EC != errc::permission_denied) - return errorCodeToError(EC); + return createStringError( + EC, Twine("Failed to rename temporary file ") + + TempFile.TmpName + " to " + ObjectPathName + ": " + + EC.message() + "\n"); auto MBCopy = MemoryBuffer::getMemBufferCopy((*MBOrErr)->getBuffer(), ObjectPathName); @@ -131,11 +138,22 @@ Expected llvm::localCache(const Twine &CacheNameRef, }); if (E) - report_fatal_error(Twine("Failed to rename temporary file ") + - TempFile.TmpName + " to " + ObjectPathName + ": " + - toString(std::move(E)) + "\n"); + return E; AddBuffer(Task, ModuleName, std::move(*MBOrErr)); + return Error::success(); + } + + ~CacheStream() { + // In Debug builds, try to track down places where commit() was not + // called before destruction. + assert(Committed); + // In Release builds, fall back to the previous behaviour of committing + // during destruction and reporting errors with report_fatal_error. 
+ if (Committed) + return; + if (Error Err = commit()) + report_fatal_error(Twine(toString(std::move(Err)))); } }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 1c8e0804ce6ca..25a8615306465 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -65,6 +65,7 @@ ModulePass *createAMDGPULowerBufferFatPointersPass(); FunctionPass *createSIModeRegisterPass(); FunctionPass *createGCNPreRAOptimizationsPass(); FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass(); +ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin { AMDGPUSimplifyLibCallsPass() {} @@ -234,6 +235,9 @@ extern char &GCNRegPressurePrinterID; void initializeAMDGPUPreloadKernArgPrologLegacyPass(PassRegistry &); extern char &AMDGPUPreloadKernArgPrologLegacyID; +void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &); +extern char &AMDGPUPreloadKernelArgumentsLegacyID; + // Passes common to R600 and SI FunctionPass *createAMDGPUPromoteAlloca(); void initializeAMDGPUPromoteAllocaPass(PassRegistry&); @@ -339,6 +343,16 @@ class AMDGPUAttributorPass : public PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; +class AMDGPUPreloadKernelArgumentsPass + : public PassInfoMixin { + const TargetMachine &TM; + +public: + explicit AMDGPUPreloadKernelArgumentsPass(const TargetMachine &TM) : TM(TM) {} + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + class AMDGPUAnnotateUniformValuesPass : public PassInfoMixin { public: @@ -359,6 +373,15 @@ extern char &AMDGPUPrintfRuntimeBindingID; void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &); extern char &AMDGPUResourceUsageAnalysisID; +struct AMDGPUExpandFeaturePredicatesPass + : PassInfoMixin { + const AMDGPUTargetMachine &TM; + AMDGPUExpandFeaturePredicatesPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {} + PreservedAnalyses run(Module &M, 
ModuleAnalysisManager &AM); + + static bool isRequired() { return true; } +}; + struct AMDGPUPrintfRuntimeBindingPass : PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); @@ -467,6 +490,9 @@ void initializeAMDGPUSetWavePriorityPass(PassRegistry &); void initializeGCNRewritePartialRegUsesPass(llvm::PassRegistry &); extern char &GCNRewritePartialRegUsesID; +void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &); +extern char &AMDGPUWaitSGPRHazardsLegacyID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 7ad6720b8001a..b312a22fb218d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1088,6 +1088,12 @@ def FeaturePrngInst : SubtargetFeature<"prng-inst", "Has v_prng_b32 instruction" >; +def FeatureBVHDualAndBVH8Insts : SubtargetFeature<"bvh-dual-bvh-8-insts", + "HasBVHDualAndBVH8Insts", + "true", + "Has image_bvh_dual_intersect_ray and image_bvh8_intersect_ray instructions" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -1245,6 +1251,12 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts", "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32" >; +def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", + "HasVMemToLDSLoad", + "true", + "The platform has memory to lds instructions (global_load w/lds bit set, buffer_load w/lds bit set or global_load_lds. This does not include scratch_load_lds." +>; + // Dummy feature used to disable assembler instructions. 
def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", @@ -1315,7 +1327,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, - FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder + FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad ] >; @@ -1339,7 +1351,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad ] >; @@ -1854,7 +1866,8 @@ def FeatureISAVersion12 : FeatureSet< FeatureDPPSrc1SGPR, FeatureMaxHardClauseLength32, Feature1_5xVGPRs, - FeatureMemoryAtomicFAddF32DenormalSupport + FeatureMemoryAtomicFAddF32DenormalSupport, + FeatureBVHDualAndBVH8Insts ]>; def FeatureISAVersion12_Generic: FeatureSet< @@ -2507,6 +2520,9 @@ def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, def HasPrngInst : Predicate<"Subtarget->hasPrngInst()">, AssemblerPredicate<(all_of FeaturePrngInst)>; +def HasBVHDualAndBVH8Insts : Predicate<"Subtarget->hasBVHDualAndBVH8Insts()">, + AssemblerPredicate<(all_of FeatureBVHDualAndBVH8Insts)>; + def HasFP8ConversionScaleInsts : Predicate<"Subtarget->hasFP8ConversionScaleInsts()">, AssemblerPredicate<(all_of FeatureFP8ConversionScaleInsts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 031d8f0560ff2..07c3319f51223 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1216,6 +1216,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if 
(getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; ProgInfo.MemOrdered = 1; + ProgInfo.FwdProgress = 1; } // 0 = X, 1 = XY, 2 = XYZ diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 4d35486577555..466643866d43d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -29,10 +29,6 @@ void initializeCycleInfoWrapperPassPass(PassRegistry &); using namespace llvm; -static cl::opt KernargPreloadCount( - "amdgpu-kernarg-preload-count", - cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0)); - static cl::opt IndirectCallSpecializationThreshold( "amdgpu-indirect-call-specialization-threshold", cl::desc( @@ -211,7 +207,7 @@ class AMDGPUInformationCache : public InformationCache { getWavesPerEU(const Function &F, std::pair FlatWorkGroupSize) { const GCNSubtarget &ST = TM.getSubtarget(F); - return ST.getWavesPerEU(F, FlatWorkGroupSize); + return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F); } std::optional> @@ -232,7 +228,8 @@ class AMDGPUInformationCache : public InformationCache { std::pair WavesPerEU, std::pair FlatWorkGroupSize) { const GCNSubtarget &ST = TM.getSubtarget(F); - return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize); + return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize, + getLDSSize(F)); } unsigned getMaxWavesPerEU(const Function &F) { @@ -257,6 +254,14 @@ class AMDGPUInformationCache : public InformationCache { return Status; } + /// Returns the minimum amount of LDS space used by a workgroup running + /// function \p F. + static unsigned getLDSSize(const Function &F) { + return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", + {0, UINT32_MAX}, true) + .first; + } + /// Get the constant access bitmap for \p C. 
uint8_t getConstantAccess(const Constant *C, SmallPtrSetImpl &Visited) { @@ -1110,47 +1115,25 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { Function *F = getAssociatedFunction(); auto &InfoCache = static_cast(A.getInfoCache()); - auto TakeRange = [&](std::pair R) { - auto [Min, Max] = R; - ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); - IntegerRangeState RangeState(Range); - clampStateAndIndicateChange(this->getState(), RangeState); - indicateOptimisticFixpoint(); - }; - - std::pair MaxWavesPerEURange{ - 1U, InfoCache.getMaxWavesPerEU(*F)}; - // If the attribute exists, we will honor it if it is not the default. if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) { + std::pair MaxWavesPerEURange{ + 1U, InfoCache.getMaxWavesPerEU(*F)}; if (*Attr != MaxWavesPerEURange) { - TakeRange(*Attr); + auto [Min, Max] = *Attr; + ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); + IntegerRangeState RangeState(Range); + this->getState() = RangeState; + indicateOptimisticFixpoint(); return; } } - // Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the - // calculation of waves per EU involves flat work group size, we can't - // simply use an assumed flat work group size as a start point, because the - // update of flat work group size is in an inverse direction of waves per - // EU. However, we can still do something if it is an entry function. Since - // an entry function is a terminal node, and flat work group size either - // from attribute or default will be used anyway, we can take that value and - // calculate the waves per EU based on it. This result can't be updated by - // no means, but that could still allow us to propagate it. 
- if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) { - std::pair FlatWorkGroupSize; - if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) - FlatWorkGroupSize = *Attr; - else - FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F); - TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange, - FlatWorkGroupSize)); - } + if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) + indicatePessimisticFixpoint(); } ChangeStatus updateImpl(Attributor &A) override { - auto &InfoCache = static_cast(A.getInfoCache()); ChangeStatus Change = ChangeStatus::UNCHANGED; auto CheckCallSite = [&](AbstractCallSite CS) { @@ -1159,24 +1142,21 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName() << "->" << Func->getName() << '\n'); - const auto *CallerInfo = A.getAAFor( + const auto *CallerAA = A.getAAFor( *this, IRPosition::function(*Caller), DepClassTy::REQUIRED); - const auto *AssumedGroupSize = A.getAAFor( - *this, IRPosition::function(*Func), DepClassTy::REQUIRED); - if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState() || - !AssumedGroupSize->isValidState()) + if (!CallerAA || !CallerAA->isValidState()) return false; - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU( - *Caller, - {CallerInfo->getAssumed().getLower().getZExtValue(), - CallerInfo->getAssumed().getUpper().getZExtValue() - 1}, - {AssumedGroupSize->getAssumed().getLower().getZExtValue(), - AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); - ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1)); - IntegerRangeState CallerRangeState(CallerRange); - Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState); + ConstantRange Assumed = getAssumed(); + unsigned Min = std::max(Assumed.getLower().getZExtValue(), + CallerAA->getAssumed().getLower().getZExtValue()); + unsigned Max = std::max(Assumed.getUpper().getZExtValue(), + 
CallerAA->getAssumed().getUpper().getZExtValue()); + ConstantRange Range(APInt(32, Min), APInt(32, Max)); + IntegerRangeState RangeState(Range); + getState() = RangeState; + Change |= getState() == Assumed ? ChangeStatus::UNCHANGED + : ChangeStatus::CHANGED; return true; }; @@ -1315,21 +1295,6 @@ struct AAAMDGPUNoAGPR const char AAAMDGPUNoAGPR::ID = 0; -static void addPreloadKernArgHint(Function &F, TargetMachine &TM) { - const GCNSubtarget &ST = TM.getSubtarget(F); - for (unsigned I = 0; - I < F.arg_size() && - I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs()); - ++I) { - Argument &Arg = *F.getArg(I); - // Check for incompatible attributes. - if (Arg.hasByRefAttr() || Arg.hasNestAttr()) - break; - - Arg.addAttr(Attribute::InReg); - } -} - static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, AMDGPUAttributorOptions Options) { SetVector Functions; @@ -1385,8 +1350,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, if (!AMDGPU::isEntryFunctionCC(CC)) { A.getOrCreateAAFor(IRPosition::function(*F)); A.getOrCreateAAFor(IRPosition::function(*F)); - } else if (CC == CallingConv::AMDGPU_KERNEL) { - addPreloadKernArgHint(*F, TM); } for (auto &I : instructions(F)) { @@ -1406,8 +1369,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, } } - ChangeStatus Change = A.run(); - return Change == ChangeStatus::CHANGED; + return A.run() == ChangeStatus::CHANGED; } class AMDGPUAttributorLegacy : public ModulePass { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 80969fce3d77f..e891fdba4e03e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -91,7 +91,15 @@ def CSR_AMDGPU_AGPRs : CalleeSavedRegs< >; def CSR_AMDGPU_SGPRs : CalleeSavedRegs< - (sequence "SGPR%u", 30, 105) + // Ensure that s30-s31 (return address), s32 (stack pointer), s33 (frame pointer), + // and s34 (base pointer) are 
callee-saved. The striped layout starts from s40, + // with a stripe width of 8. The last stripe is 10 wide instead of 8, to avoid + // ending with a 2-wide stripe. + (add (sequence "SGPR%u", 30, 39), + (sequence "SGPR%u", 48, 55), + (sequence "SGPR%u", 64, 71), + (sequence "SGPR%u", 80, 87), + (sequence "SGPR%u", 96, 105)) >; def CSR_AMDGPU_SI_Gfx_SGPRs : CalleeSavedRegs< diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp new file mode 100644 index 0000000000000..cd9e29a4e7d67 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -0,0 +1,167 @@ +//===- AMDGPUExpandFeaturePredicates.cpp - Feature Predicate Expander Pass ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file implements a pass that deals with expanding AMDGCN generic feature +// predicates into target specific quantities / sequences. In this context, a +// generic feature predicate is an implementation detail global variable that +// is inserted by the FE as a consequence of using either the __builtin_cpu_is +// or the __builtin_amdgcn_is_invocable special builtins on an abstract target +// (AMDGCNSPIRV). These placeholder globals are used to guide target specific +// lowering, once the concrete target is known, by way of constant folding their +// value all the way into a terminator (i.e. a controlled block) or into a no +// live use scenario. We hard fail if the folding fails, to avoid obtuse BE +// errors or opaque run time errors. This pass should run as early as possible / +// immediately after Clang CodeGen, so that the optimisation pipeline and the BE +// operate with concrete target data. 
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" + +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/Local.h" + +#include +#include + +using namespace llvm; + +namespace { +template void collectUsers(Value *V, C &Container) { + assert(V && "Must pass an existing Value!"); + + for (auto &&U : V->users()) + if (auto *I = dyn_cast(U)) + Container.insert(Container.end(), I); +} + +inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) { + const bool IsFeature = P->getName().starts_with("llvm.amdgcn.has"); + const size_t Offset = + IsFeature ? sizeof("llvm.amdgcn.has") : sizeof("llvm.amdgcn.is"); + + std::string PV = P->getName().substr(Offset).str(); + if (IsFeature) { + size_t Dx = PV.find(','); + while (Dx != std::string::npos) { + PV.insert(++Dx, {'+'}); + + Dx = PV.find(',', Dx); + } + PV.insert(PV.cbegin(), '+'); + } + + Type *PTy = P->getValueType(); + P->setLinkage(GlobalValue::PrivateLinkage); + P->setExternallyInitialized(false); + + if (IsFeature) + P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures(PV))); + else + P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU())); +} + +std::pair +unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) { + std::string W; + raw_string_ostream OS(W); + + OS << "Impossible to constant fold feature predicate: " << *P << " used by " + << *NoFold << ", please simplify.\n"; + + Caller->getContext().diagnose( + DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error)); + + return {PreservedAnalyses::none(), false}; +} + +std::pair +handlePredicate(const GCNSubtarget &ST, FunctionAnalysisManager &FAM, + 
SmallPtrSet &Predicated, GlobalVariable *P) { + setPredicate(ST, P); + + SmallPtrSet ToFold; + collectUsers(P, ToFold); + + if (ToFold.empty()) + return {PreservedAnalyses::all(), true}; + + do { + Instruction *I = *ToFold.begin(); + ToFold.erase(I); + + I->dropDroppableUses(); + + Function *F = I->getParent()->getParent(); + auto &DT = FAM.getResult(*F); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + + if (auto *C = ConstantFoldInstruction(I, P->getDataLayout())) { + collectUsers(I, ToFold); + I->replaceAllUsesWith(C); + I->eraseFromParent(); + continue; + } else if (I->isTerminator() && + ConstantFoldTerminator(I->getParent(), true, nullptr, &DTU)) { + Predicated.insert(F); + + continue; + } + + return unfoldableFound(I->getParent()->getParent(), P, I); + } while (!ToFold.empty()); + + return {PreservedAnalyses::none(), true}; +} +} // Unnamed namespace. + +PreservedAnalyses +AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) { + if (M.empty()) + return PreservedAnalyses::all(); + + SmallVector Predicates; + for (auto &&G : M.globals()) { + if (!G.isDeclaration() || !G.hasName()) + continue; + if (G.getName().starts_with("llvm.amdgcn.")) + Predicates.push_back(&G); + } + + if (Predicates.empty()) + return PreservedAnalyses::all(); + + const auto &ST = TM.getSubtarget( + *find_if(M, [](auto &&F) { return !F.isIntrinsic(); })); + + auto &FAM = MAM.getResult(M).getManager(); + SmallPtrSet Predicated; + auto Ret = PreservedAnalyses::all(); + for (auto &&P : Predicates) { + auto R = handlePredicate(ST, FAM, Predicated, P); + + if (!R.second) + break; + + Ret.intersect(R.first); + } + + for (auto &&P : Predicates) + P->eraseFromParent(); + for (auto &&F : Predicated) + removeUnreachableBlocks(*F); + + return Ret; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 8e90754103ff1..e93a401ee20fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2665,8 +2665,20 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { // We need to handle this here because tablegen doesn't support matching // instructions with multiple outputs. -void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) { - unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32; +void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) { + unsigned Opc; + switch (IntrID) { + case Intrinsic::amdgcn_ds_bvh_stack_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: + Opc = AMDGPU::DS_BVH_STACK_RTN_B32; + break; + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: + Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32; + break; + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: + Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64; + break; + } SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4), N->getOperand(5), N->getOperand(0)}; @@ -2830,7 +2842,10 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { return; } case Intrinsic::amdgcn_ds_bvh_stack_rtn: - SelectDSBvhStackIntrinsic(N); + case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: + SelectDSBvhStackIntrinsic(N, IntrID); return; case Intrinsic::amdgcn_init_whole_wave: CurDAG->getMachineFunction() diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 7dcd208a9cdd4..f3b9364fdb92b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -267,7 +267,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { void SelectFMAD_FMA(SDNode *N); void SelectFP_EXTEND(SDNode *N); void SelectDSAppendConsume(SDNode *N, unsigned IntrID); - void SelectDSBvhStackIntrinsic(SDNode *N); + void SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID); void SelectDS_GWS(SDNode *N, 
unsigned IntrID); void SelectInterpP1F16(SDNode *N); void SelectINTRINSIC_W_CHAIN(SDNode *N); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index cca9fa72d0ca5..5ea0c895b9270 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -6000,6 +6000,7 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, // TODO: Handle more intrinsics switch (IntrinsicID) { case Intrinsic::amdgcn_cubeid: + case Intrinsic::amdgcn_cvt_off_f32_i4: return true; case Intrinsic::amdgcn_frexp_mant: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index bac3bb5fde7b0..c3f35ef5b2742 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -18,6 +18,7 @@ #include "AMDGPUTargetTransformInfo.h" #include "GCNSubtarget.h" #include "llvm/ADT/FloatingPointMode.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include @@ -481,6 +482,98 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC, return false; } +static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old, + Function &NewCallee, ArrayRef Ops) { + SmallVector OpBundles; + Old.getOperandBundlesAsDefs(OpBundles); + + CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles); + NewCall->takeName(&Old); + return NewCall; +} + +Instruction * +GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC, + IntrinsicInst &II) const { + const auto IID = II.getIntrinsicID(); + assert(IID == Intrinsic::amdgcn_readlane || + IID == Intrinsic::amdgcn_readfirstlane || + IID == Intrinsic::amdgcn_permlane64); + + Instruction *OpInst = dyn_cast(II.getOperand(0)); + + // Only do this if both instructions are in the same block + // (so the exec mask won't change) and the readlane is the 
only user of its + // operand. + if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent()) + return nullptr; + + const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane); + + // If this is a readlane, check that the second operand is a constant, or is + // defined before OpInst so we know it's safe to move this intrinsic higher. + Value *LaneID = nullptr; + if (IsReadLane) { + LaneID = II.getOperand(1); + + // readlane take an extra operand for the lane ID, so we must check if that + // LaneID value can be used at the point where we want to move the + // intrinsic. + if (auto *LaneIDInst = dyn_cast(LaneID)) { + if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst)) + return nullptr; + } + } + + // Hoist the intrinsic (II) through OpInst. + // + // (II (OpInst x)) -> (OpInst (II x)) + const auto DoIt = [&](unsigned OpIdx, + Function *NewIntrinsic) -> Instruction * { + SmallVector Ops{OpInst->getOperand(OpIdx)}; + if (IsReadLane) + Ops.push_back(LaneID); + + // Rewrite the intrinsic call. + CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops); + + // Rewrite OpInst so it takes the result of the intrinsic now. + Instruction &NewOp = *OpInst->clone(); + NewOp.setOperand(OpIdx, NewII); + return &NewOp; + }; + + // TODO(?): Should we do more with permlane64? + if (IID == Intrinsic::amdgcn_permlane64 && !isa(OpInst)) + return nullptr; + + if (isa(OpInst)) + return DoIt(0, II.getCalledFunction()); + + if (isa(OpInst)) { + Value *Src = OpInst->getOperand(0); + Type *SrcTy = Src->getType(); + if (!isTypeLegal(SrcTy)) + return nullptr; + + Function *Remangled = + Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy}); + return DoIt(0, Remangled); + } + + // We can also hoist through binary operators if the other operand is uniform. + if (isa(OpInst)) { + // FIXME: If we had access to UniformityInfo here we could just check + // if the operand is uniform. 
+ if (isTriviallyUniform(OpInst->getOperandUse(0))) + return DoIt(1, II.getCalledFunction()); + if (isTriviallyUniform(OpInst->getOperandUse(1))) + return DoIt(0, II.getCalledFunction()); + } + + return nullptr; +} + std::optional GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Intrinsic::ID IID = II.getIntrinsicID(); @@ -718,6 +811,29 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } + case Intrinsic::amdgcn_cvt_off_f32_i4: { + Value* Arg = II.getArgOperand(0); + Type *Ty = II.getType(); + + if (isa(Arg)) + return IC.replaceInstUsesWith(II, PoisonValue::get(Ty)); + + if(IC.getSimplifyQuery().isUndefValue(Arg)) + return IC.replaceInstUsesWith(II, Constant::getNullValue(Ty)); + + ConstantInt *CArg = dyn_cast(II.getArgOperand(0)); + if (!CArg) + break; + + // Tabulated 0.0625 * (sext (CArg & 0xf)). + constexpr size_t ResValsSize = 16; + static constexpr float ResVals[ResValsSize] = { + 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375, + -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625}; + Constant *Res = + ConstantFP::get(Ty, ResVals[CArg->getZExtValue() & (ResValsSize - 1)]); + return IC.replaceInstUsesWith(II, Res); + } case Intrinsic::amdgcn_ubfe: case Intrinsic::amdgcn_sbfe: { // Decompose simple cases into standard shifts. 
@@ -1128,6 +1244,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { simplifyDemandedLaneMaskArg(IC, II, 1)) return &II; + if (IID != Intrinsic::amdgcn_ds_bpermute) { + if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II)) + return Res; + } + return std::nullopt; } case Intrinsic::amdgcn_writelane: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 40eaba2c09209..51c2d53c34586 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2261,7 +2261,21 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic( Register Data1 = MI.getOperand(5).getReg(); unsigned Offset = MI.getOperand(6).getImm(); - auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0) + unsigned Opc; + switch (cast(MI).getIntrinsicID()) { + case Intrinsic::amdgcn_ds_bvh_stack_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: + Opc = AMDGPU::DS_BVH_STACK_RTN_B32; + break; + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: + Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32; + break; + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: + Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64; + break; + } + + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0) .addDef(Dst1) .addUse(Addr) .addUse(Data0) @@ -2316,6 +2330,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( } break; case Intrinsic::amdgcn_ds_bvh_stack_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: return selectDSBvhStackIntrinsic(I); case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: @@ -3337,7 +3354,8 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( } bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { - 
assert(!AMDGPU::isGFX12Plus(STI)); + if (!Subtarget->hasVMemToLDSLoad()) + return false; unsigned Opc; unsigned Size = MI.getOperand(3).getImm(); @@ -3473,6 +3491,9 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { } bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ + if (!Subtarget->hasVMemToLDSLoad()) + return false; + unsigned Opc; unsigned Size = MI.getOperand(3).getImm(); @@ -3564,11 +3585,14 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } -bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{ - MI.setDesc(TII.get(MI.getOperand(1).getImm())); - MI.removeOperand(1); +bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic( + MachineInstr &MI) const { + unsigned OpcodeOpIdx = + MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3; + MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm())); + MI.removeOperand(OpcodeOpIdx); MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); - return true; + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); } // FIXME: This should be removed and let the patterns select. 
We just need the @@ -4082,8 +4106,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { assert(Intr && "not an image intrinsic with image pseudo"); return selectImageIntrinsic(I, Intr); } - case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: - return selectBVHIntrinsic(I); + case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: + case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: + case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY: + return selectBVHIntersectRayIntrinsic(I); case AMDGPU::G_SBFX: case AMDGPU::G_UBFX: return selectG_SBFX_UBFX(I); @@ -6107,13 +6133,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : (int64_t)SISrcMods::DST_OP_SEL); } @@ -6122,13 +6148,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x2) ? 
(int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)(SISrcMods::OP_SEL_0) : 0); } @@ -6157,8 +6183,9 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0( void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0); + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) + ? (int64_t)SISrcMods::DST_OP_SEL + : 0); } void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index b0d2a73fe31d2..5ba755c75afd5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -146,7 +146,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; bool selectBufferLoadLds(MachineInstr &MI) const; bool selectGlobalLoadLds(MachineInstr &MI) const; - bool selectBVHIntrinsic(MachineInstr &I) const; + bool selectBVHIntersectRayIntrinsic(MachineInstr &I) const; bool selectSMFMACIntrin(MachineInstr &I) const; bool selectPermlaneSwapIntrin(MachineInstr &I, Intrinsic::ID IntrID) const; bool selectWaveAddress(MachineInstr &I) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index f4e651ec477d3..699eab123ad3d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -77,8 +77,6 @@ class LiveRegOptimizer { const GCNSubtarget &ST; /// The scalar type to convert to Type *const ConvertToScalar; - /// The set of visited Instructions - SmallPtrSet Visited; /// Map of Value -> Converted Value ValueToValueMap ValMap; /// Map of containing conversions from Optimal Type -> Original Type per BB. @@ -248,6 +246,7 @@ bool LiveRegOptimizer::optimizeLiveType( SmallPtrSet PhiNodes; SmallPtrSet Defs; SmallPtrSet Uses; + SmallPtrSet Visited; Worklist.push_back(cast(I)); while (!Worklist.empty()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index e9e47eaadd557..638763872f0b8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1042,8 +1042,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC); if (ST.hasCvtPkF16F32Inst()) - FPTruncActions.legalFor( - {{S32, S64}, {S16, S32}, {V2S16, V2S32}, {V2S16, V2S64}}); + FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}}); else FPTruncActions.legalFor({{S32, S64}, {S16, S32}}); FPTruncActions.scalarize(0).lower(); @@ -7020,14 +7019,13 @@ bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI, return true; } -bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, - MachineIRBuilder &B) const { +bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic( + MachineInstr &MI, MachineIRBuilder &B) const { MachineRegisterInfo &MRI = *B.getMRI(); const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); const LLT V2S16 = LLT::fixed_vector(2, 16); const LLT V3S32 = LLT::fixed_vector(3, 32); - Register DstReg = MI.getOperand(0).getReg(); Register NodePtr = MI.getOperand(2).getReg(); Register RayExtent = MI.getOperand(3).getReg(); @@ -7035,7 +7033,6 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 
Register RayDir = MI.getOperand(5).getReg(); Register RayInvDir = MI.getOperand(6).getReg(); Register TDescr = MI.getOperand(7).getReg(); - if (!ST.hasGFX10_AEncoding()) { DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), "intrinsic not supported on subtarget", @@ -7043,7 +7040,6 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, B.getMF().getFunction().getContext().diagnose(BadIntrin); return false; } - const bool IsGFX11 = AMDGPU::isGFX11(ST); const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST); @@ -7054,7 +7050,6 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; const bool UseNSA = IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize()); - const unsigned BaseOpcodes[2][2] = { {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, @@ -7074,7 +7069,6 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, NumVDataDwords, NumVAddrDwords); } assert(Opcode != -1); - SmallVector Ops; if (UseNSA && IsGFX11Plus) { auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { @@ -7083,11 +7077,9 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); Ops.push_back(Merged.getReg(0)); }; - Ops.push_back(NodePtr); Ops.push_back(RayExtent); packLanes(RayOrigin); - if (IsA16) { auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); @@ -7119,14 +7111,12 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, Ops.push_back(NodePtr); } Ops.push_back(RayExtent); - auto packLanes = [&Ops, &S32, &B](Register Src) { auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); Ops.push_back(Unmerge.getReg(0)); Ops.push_back(Unmerge.getReg(1)); Ops.push_back(Unmerge.getReg(2)); }; - 
packLanes(RayOrigin); if (IsA16) { auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); @@ -7148,7 +7138,6 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, packLanes(RayInvDir); } } - if (!UseNSA) { // Build a single vector containing all the operands so far prepared. LLT OpTy = LLT::fixed_vector(Ops.size(), 32); @@ -7156,18 +7145,69 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, Ops.clear(); Ops.push_back(MergedOps); } - - auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) - .addDef(DstReg) - .addImm(Opcode); - + auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY) + .addDef(DstReg) + .addImm(Opcode); for (Register R : Ops) { MIB.addUse(R); } - MIB.addUse(TDescr) .addImm(IsA16 ? 1 : 0) .cloneMemRefs(MI); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic( + MachineInstr &MI, MachineIRBuilder &B) const { + const LLT S32 = LLT::scalar(32); + const LLT V2S32 = LLT::fixed_vector(2, 32); + + Register DstReg = MI.getOperand(0).getReg(); + Register DstOrigin = MI.getOperand(1).getReg(); + Register DstDir = MI.getOperand(2).getReg(); + Register NodePtr = MI.getOperand(4).getReg(); + Register RayExtent = MI.getOperand(5).getReg(); + Register InstanceMask = MI.getOperand(6).getReg(); + Register RayOrigin = MI.getOperand(7).getReg(); + Register RayDir = MI.getOperand(8).getReg(); + Register Offsets = MI.getOperand(9).getReg(); + Register TDescr = MI.getOperand(10).getReg(); + + if (!ST.hasBVHDualAndBVH8Insts()) { + DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), + "intrinsic not supported on subtarget", + MI.getDebugLoc()); + B.getMF().getFunction().getContext().diagnose(BadIntrin); + return false; + } + + bool IsBVH8 = cast(MI).getIntrinsicID() == + Intrinsic::amdgcn_image_bvh8_intersect_ray; + const unsigned NumVDataDwords = 10; + const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12; + int Opcode = AMDGPU::getMIMGOpcode( + IsBVH8 ? 
AMDGPU::IMAGE_BVH8_INTERSECT_RAY + : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY, + AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords); + assert(Opcode != -1); + + auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr( + V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)}); + + B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY + : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY) + .addDef(DstReg) + .addDef(DstOrigin) + .addDef(DstDir) + .addImm(Opcode) + .addUse(NodePtr) + .addUse(RayExtentInstanceMaskVec.getReg(0)) + .addUse(RayOrigin) + .addUse(RayDir) + .addUse(Offsets) + .addUse(TDescr) + .cloneMemRefs(MI); MI.eraseFromParent(); return true; @@ -7525,7 +7565,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_rsq_clamp: return legalizeRsqClampIntrinsic(MI, MRI, B); case Intrinsic::amdgcn_image_bvh_intersect_ray: - return legalizeBVHIntrinsic(MI, B); + return legalizeBVHIntersectRayIntrinsic(MI, B); + case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: + case Intrinsic::amdgcn_image_bvh8_intersect_ray: + return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B); case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 86c15197805d2..f13d95dc31e3d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -205,11 +205,15 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const; + bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, + MachineIRBuilder &B) const; + + bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, + MachineIRBuilder &B) const; + bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const; - bool 
legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const; - bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index e9d009baa20af..d2dd6869f1070 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -27,230 +27,6 @@ using namespace llvm; namespace { -class PreloadKernelArgInfo { -private: - Function &F; - const GCNSubtarget &ST; - unsigned NumFreeUserSGPRs; - - enum HiddenArg : unsigned { - HIDDEN_BLOCK_COUNT_X, - HIDDEN_BLOCK_COUNT_Y, - HIDDEN_BLOCK_COUNT_Z, - HIDDEN_GROUP_SIZE_X, - HIDDEN_GROUP_SIZE_Y, - HIDDEN_GROUP_SIZE_Z, - HIDDEN_REMAINDER_X, - HIDDEN_REMAINDER_Y, - HIDDEN_REMAINDER_Z, - END_HIDDEN_ARGS - }; - - // Stores information about a specific hidden argument. - struct HiddenArgInfo { - // Offset in bytes from the location in the kernearg segment pointed to by - // the implicitarg pointer. - uint8_t Offset; - // The size of the hidden argument in bytes. - uint8_t Size; - // The name of the hidden argument in the kernel signature. 
- const char *Name; - }; - - static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = { - {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"}, - {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"}, - {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"}, - {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"}, - {22, 2, "_hidden_remainder_z"}}; - - static HiddenArg getHiddenArgFromOffset(unsigned Offset) { - for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I) - if (HiddenArgs[I].Offset == Offset) - return static_cast(I); - - return END_HIDDEN_ARGS; - } - - static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) { - if (HA < END_HIDDEN_ARGS) - return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8); - - llvm_unreachable("Unexpected hidden argument."); - } - - static const char *getHiddenArgName(HiddenArg HA) { - if (HA < END_HIDDEN_ARGS) { - return HiddenArgs[HA].Name; - } - llvm_unreachable("Unexpected hidden argument."); - } - - // Clones the function after adding implicit arguments to the argument list - // and returns the new updated function. Preloaded implicit arguments are - // added up to and including the last one that will be preloaded, indicated by - // LastPreloadIndex. Currently preloading is only performed on the totality of - // sequential data from the kernarg segment including implicit (hidden) - // arguments. This means that all arguments up to the last preloaded argument - // will also be preloaded even if that data is unused. 
- Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) { - FunctionType *FT = F.getFunctionType(); - LLVMContext &Ctx = F.getParent()->getContext(); - SmallVector FTypes(FT->param_begin(), FT->param_end()); - for (unsigned I = 0; I <= LastPreloadIndex; ++I) - FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I))); - - FunctionType *NFT = - FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg()); - Function *NF = - Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName()); - - NF->copyAttributesFrom(&F); - NF->copyMetadata(&F, 0); - NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat); - - F.getParent()->getFunctionList().insert(F.getIterator(), NF); - NF->takeName(&F); - NF->splice(NF->begin(), &F); - - Function::arg_iterator NFArg = NF->arg_begin(); - for (Argument &Arg : F.args()) { - Arg.replaceAllUsesWith(&*NFArg); - NFArg->takeName(&Arg); - ++NFArg; - } - - AttrBuilder AB(Ctx); - AB.addAttribute(Attribute::InReg); - AB.addAttribute("amdgpu-hidden-argument"); - AttributeList AL = NF->getAttributes(); - for (unsigned I = 0; I <= LastPreloadIndex; ++I) { - AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB); - NFArg++->setName(getHiddenArgName(HiddenArg(I))); - } - - NF->setAttributes(AL); - F.replaceAllUsesWith(NF); - F.setCallingConv(CallingConv::C); - - return NF; - } - -public: - PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) { - setInitialFreeUserSGPRsCount(); - } - - // Returns the maximum number of user SGPRs that we have available to preload - // arguments. - void setInitialFreeUserSGPRsCount() { - GCNUserSGPRUsageInfo UserSGPRInfo(F, ST); - NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs(); - } - - bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset, - uint64_t LastExplicitArgOffset) { - // Check if this argument may be loaded into the same register as the - // previous argument. 
- if (ArgOffset - LastExplicitArgOffset < 4 && - !isAligned(Align(4), ArgOffset)) - return true; - - // Pad SGPRs for kernarg alignment. - ArgOffset = alignDown(ArgOffset, 4); - unsigned Padding = ArgOffset - LastExplicitArgOffset; - unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; - unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4; - if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs) - return false; - - NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs); - return true; - } - - // Try to allocate SGPRs to preload implicit kernel arguments. - void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset, - uint64_t LastExplicitArgOffset, - IRBuilder<> &Builder) { - Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists( - F.getParent(), Intrinsic::amdgcn_implicitarg_ptr); - if (!ImplicitArgPtr) - return; - - const DataLayout &DL = F.getParent()->getDataLayout(); - // Pair is the load and the load offset. - SmallVector, 4> ImplicitArgLoads; - for (auto *U : ImplicitArgPtr->users()) { - Instruction *CI = dyn_cast(U); - if (!CI || CI->getParent()->getParent() != &F) - continue; - - for (auto *U : CI->users()) { - int64_t Offset = 0; - auto *Load = dyn_cast(U); // Load from ImplicitArgPtr? - if (!Load) { - if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) - continue; - - Load = dyn_cast(*U->user_begin()); // Load from GEP? - } - - if (!Load || !Load->isSimple()) - continue; - - // FIXME: Expand to handle 64-bit implicit args and large merged loads. - LLVMContext &Ctx = F.getParent()->getContext(); - Type *LoadTy = Load->getType(); - HiddenArg HA = getHiddenArgFromOffset(Offset); - if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA)) - continue; - - ImplicitArgLoads.push_back(std::make_pair(Load, Offset)); - } - } - - if (ImplicitArgLoads.empty()) - return; - - // Allocate loads in order of offset. We need to be sure that the implicit - // argument can actually be preloaded. 
- std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second()); - - // If we fail to preload any implicit argument we know we don't have SGPRs - // to preload any subsequent ones with larger offsets. Find the first - // argument that we cannot preload. - auto *PreloadEnd = std::find_if( - ImplicitArgLoads.begin(), ImplicitArgLoads.end(), - [&](const std::pair &Load) { - unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType()); - unsigned LoadOffset = Load.second; - if (!tryAllocPreloadSGPRs(LoadSize, - LoadOffset + ImplicitArgsBaseOffset, - LastExplicitArgOffset)) - return true; - - LastExplicitArgOffset = - ImplicitArgsBaseOffset + LoadOffset + LoadSize; - return false; - }); - - if (PreloadEnd == ImplicitArgLoads.begin()) - return; - - unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second); - Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex); - assert(NF); - for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) { - LoadInst *LoadInst = I->first; - unsigned LoadOffset = I->second; - unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset); - unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1; - Argument *Arg = NF->getArg(Index); - LoadInst->replaceAllUsesWith(Arg); - } - } -}; - class AMDGPULowerKernelArguments : public FunctionPass { public: static char ID; @@ -310,10 +86,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); uint64_t ExplicitArgOffset = 0; - // Preloaded kernel arguments must be sequential. - bool InPreloadSequence = true; - PreloadKernelArgInfo PreloadInfo(F, ST); - for (Argument &Arg : F.args()) { const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? 
Arg.getParamByRefType() : Arg.getType(); @@ -324,25 +96,10 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset; - uint64_t LastExplicitArgOffset = ExplicitArgOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; - // Guard against the situation where hidden arguments have already been - // lowered and added to the kernel function signiture, i.e. in a situation - // where this pass has run twice. - if (Arg.hasAttribute("amdgpu-hidden-argument")) - break; - - // Try to preload this argument into user SGPRs. - if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() && - !Arg.getType()->isAggregateType()) - if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset, - LastExplicitArgOffset)) - continue; - - InPreloadSequence = false; - - if (Arg.use_empty()) + // Skip inreg arguments which should be preloaded. + if (Arg.use_empty() || Arg.hasInRegAttr()) continue; // If this is byval, the loads are already explicit in the function. 
We just @@ -482,14 +239,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { KernArgSegment->addRetAttr( Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); - if (InPreloadSequence) { - uint64_t ImplicitArgsBaseOffset = - alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) + - BaseOffset; - PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset, - ExplicitArgOffset, Builder); - } - return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 18610749c23e2..e2153bed41ed3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -186,6 +186,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" #include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" @@ -1544,6 +1545,8 @@ class AMDGPULowerModuleLDS { if (!MaxDepth || (A == 1 && !AliasScope)) return; + ScopedNoAliasAAResult ScopedNoAlias; + for (User *U : Ptr->users()) { if (auto *I = dyn_cast(U)) { if (AliasScope && I->mayReadOrWriteMemory()) { @@ -1553,7 +1556,34 @@ I->setMetadata(LLVMContext::MD_alias_scope, AS); MDNode *NA = I->getMetadata(LLVMContext::MD_noalias); - NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias); + + // Scoped aliases can originate from two different domains. + // First domain would be from LDS domain (created by this pass). + // All entries (LDS vars) into LDS struct will have same domain. + + // Second domain could be existing scoped aliases that are the + // results of noalias params and subsequent optimizations that + // may alter these sets. + + // We need to be careful how we create new alias sets, and + // have right scopes and domains for loads/stores of these new + // LDS variables. 
We intersect NoAlias set if alias sets belong + // to the same domain. This is the case if we have memcpy using + // LDS variables. Both src and dst of memcpy would belong to + // LDS struct, they do not alias. + // On the other hand, if one of the domains is LDS and the other is + // existing domain prior to LDS, we need to have a union of all + // these alias sets to preserve existing aliasing information. + + SmallPtrSet ExistingDomains, LDSDomains; + ScopedNoAlias.collectScopedDomains(NA, ExistingDomains); + ScopedNoAlias.collectScopedDomains(NoAlias, LDSDomains); + auto Intersection = set_intersection(ExistingDomains, LDSDomains); + if (Intersection.empty()) { + NA = NA ? MDNode::concatenate(NA, NoAlias) : NoAlias; + } else { + NA = NA ? MDNode::intersect(NA, NoAlias) : NoAlias; + } I->setMetadata(LLVMContext::MD_noalias, NA); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 92cc4972fb65a..e8c794711889c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -16,6 +16,8 @@ #ifndef MODULE_PASS #define MODULE_PASS(NAME, CREATE_PASS) #endif +MODULE_PASS("amdgpu-expand-feature-predicates", + AMDGPUExpandFeaturePredicatesPass(*this)) MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass()) MODULE_PASS("amdgpu-lower-buffer-fat-pointers", AMDGPULowerBufferFatPointersPass(*this)) @@ -25,6 +27,7 @@ MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this)) MODULE_PASS("amdgpu-perf-hint", AMDGPUPerfHintAnalysisPass( *static_cast(this))) +MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this)) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp new file mode 100644 index 0000000000000..c1626b4fac869 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp @@ -0,0 +1,358 @@ +//===- AMDGPUPreloadKernelArguments.cpp - Preload Kernel Arguments --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This pass preloads kernel arguments into user_data SGPRs before kernel +/// execution begins. The number of registers available for preloading depends +/// on the number of free user SGPRs, up to the hardware's maximum limit. +/// Implicit arguments enabled in the kernel descriptor are allocated first, +/// followed by SGPRs used for preloaded kernel arguments. (Reference: +/// https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state) +/// Additionally, hidden kernel arguments may be preloaded, in which case they +/// are appended to the kernel signature after explicit arguments. Preloaded +/// arguments will be marked with `inreg`. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Pass.h" + +#define DEBUG_TYPE "amdgpu-preload-kernel-arguments" + +using namespace llvm; + +static cl::opt KernargPreloadCount( + "amdgpu-kernarg-preload-count", + cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0)); + +namespace { + +class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass { + const GCNTargetMachine *TM; + +public: + static char ID; + explicit AMDGPUPreloadKernelArgumentsLegacy( + const GCNTargetMachine *TM = nullptr); + + StringRef getPassName() const override { + return "AMDGPU Preload Kernel Arguments"; + } + + bool runOnModule(Module &M) override; +}; + +class PreloadKernelArgInfo { +private: + Function &F; + const GCNSubtarget &ST; + unsigned NumFreeUserSGPRs; + + enum HiddenArg : unsigned { + HIDDEN_BLOCK_COUNT_X, + HIDDEN_BLOCK_COUNT_Y, + HIDDEN_BLOCK_COUNT_Z, + HIDDEN_GROUP_SIZE_X, + HIDDEN_GROUP_SIZE_Y, + HIDDEN_GROUP_SIZE_Z, + HIDDEN_REMAINDER_X, + HIDDEN_REMAINDER_Y, + HIDDEN_REMAINDER_Z, + END_HIDDEN_ARGS + }; + + // Stores information about a specific hidden argument. + struct HiddenArgInfo { + // Offset in bytes from the location in the kernarg segment pointed to by + // the implicitarg pointer. + uint8_t Offset; + // The size of the hidden argument in bytes. + uint8_t Size; + // The name of the hidden argument in the kernel signature. 
+ const char *Name; + }; + + static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = { + {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"}, + {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"}, + {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"}, + {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"}, + {22, 2, "_hidden_remainder_z"}}; + + static HiddenArg getHiddenArgFromOffset(unsigned Offset) { + for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I) + if (HiddenArgs[I].Offset == Offset) + return static_cast(I); + + return END_HIDDEN_ARGS; + } + + static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) { + if (HA < END_HIDDEN_ARGS) + return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8); + + llvm_unreachable("Unexpected hidden argument."); + } + + static const char *getHiddenArgName(HiddenArg HA) { + if (HA < END_HIDDEN_ARGS) + return HiddenArgs[HA].Name; + + llvm_unreachable("Unexpected hidden argument."); + } + + // Clones the function after adding implicit arguments to the argument list + // and returns the new updated function. Preloaded implicit arguments are + // added up to and including the last one that will be preloaded, indicated by + // LastPreloadIndex. Currently preloading is only performed on the totality of + // sequential data from the kernarg segment including implicit (hidden) + // arguments. This means that all arguments up to the last preloaded argument + // will also be preloaded even if that data is unused. 
+ Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) { + FunctionType *FT = F.getFunctionType(); + LLVMContext &Ctx = F.getParent()->getContext(); + SmallVector FTypes(FT->param_begin(), FT->param_end()); + for (unsigned I = 0; I <= LastPreloadIndex; ++I) + FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I))); + + FunctionType *NFT = + FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg()); + Function *NF = + Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName()); + + NF->copyAttributesFrom(&F); + NF->copyMetadata(&F, 0); + NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat); + + F.getParent()->getFunctionList().insert(F.getIterator(), NF); + NF->takeName(&F); + NF->splice(NF->begin(), &F); + + Function::arg_iterator NFArg = NF->arg_begin(); + for (Argument &Arg : F.args()) { + Arg.replaceAllUsesWith(&*NFArg); + NFArg->takeName(&Arg); + ++NFArg; + } + + AttrBuilder AB(Ctx); + AB.addAttribute(Attribute::InReg); + AB.addAttribute("amdgpu-hidden-argument"); + AttributeList AL = NF->getAttributes(); + for (unsigned I = 0; I <= LastPreloadIndex; ++I) { + AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB); + NFArg++->setName(getHiddenArgName(HiddenArg(I))); + } + + NF->setAttributes(AL); + F.replaceAllUsesWith(NF); + + return NF; + } + +public: + PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) { + setInitialFreeUserSGPRsCount(); + } + + // Returns the maximum number of user SGPRs that we have available to preload + // arguments. + void setInitialFreeUserSGPRsCount() { + GCNUserSGPRUsageInfo UserSGPRInfo(F, ST); + NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs(); + } + + bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) { + return ExplicitArgOffset <= NumFreeUserSGPRs * 4; + } + + // Try to allocate SGPRs to preload hidden kernel arguments. 
+ void + tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset, + SmallVectorImpl &FunctionsToErase) { + Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::amdgcn_implicitarg_ptr); + if (!ImplicitArgPtr) + return; + + const DataLayout &DL = F.getParent()->getDataLayout(); + // Pair is the load and the load offset. + SmallVector, 4> ImplicitArgLoads; + for (auto *U : ImplicitArgPtr->users()) { + Instruction *CI = dyn_cast(U); + if (!CI || CI->getParent()->getParent() != &F) + continue; + + for (auto *U : CI->users()) { + int64_t Offset = 0; + auto *Load = dyn_cast(U); // Load from ImplicitArgPtr? + if (!Load) { + if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) + continue; + + Load = dyn_cast(*U->user_begin()); // Load from GEP? + } + + if (!Load || !Load->isSimple()) + continue; + + // FIXME: Expand to handle merged loads. + LLVMContext &Ctx = F.getParent()->getContext(); + Type *LoadTy = Load->getType(); + HiddenArg HA = getHiddenArgFromOffset(Offset); + if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA)) + continue; + + ImplicitArgLoads.push_back(std::make_pair(Load, Offset)); + } + } + + if (ImplicitArgLoads.empty()) + return; + + // Allocate loads in order of offset. We need to be sure that the implicit + // argument can actually be preloaded. + std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second()); + + // If we fail to preload any implicit argument we know we don't have SGPRs + // to preload any subsequent ones with larger offsets. Find the first + // argument that we cannot preload. 
+ auto *PreloadEnd = + std::find_if(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), + [&](const std::pair &Load) { + unsigned LoadSize = + DL.getTypeStoreSize(Load.first->getType()); + unsigned LoadOffset = Load.second; + if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize + + ImplicitArgsBaseOffset)) + return true; + + return false; + }); + + if (PreloadEnd == ImplicitArgLoads.begin()) + return; + + unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second); + Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex); + assert(NF); + FunctionsToErase.push_back(&F); + for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) { + LoadInst *LoadInst = I->first; + unsigned LoadOffset = I->second; + unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset); + unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1; + Argument *Arg = NF->getArg(Index); + LoadInst->replaceAllUsesWith(Arg); + } + } +}; + +} // end anonymous namespace + +char AMDGPUPreloadKernelArgumentsLegacy::ID = 0; + +INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE, + "AMDGPU Preload Kernel Arguments", false, false) + +ModulePass * +llvm::createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *TM) { + return new AMDGPUPreloadKernelArgumentsLegacy( + static_cast(TM)); +} + +AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy( + const GCNTargetMachine *TM) + : ModulePass(ID), TM(TM) {} + +static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) { + SmallVector FunctionsToErase; + bool Changed = false; + for (auto &F : M) { + const GCNSubtarget &ST = TM.getSubtarget(F); + if (!ST.hasKernargPreload() || + F.getCallingConv() != CallingConv::AMDGPU_KERNEL) + continue; + + PreloadKernelArgInfo PreloadInfo(F, ST); + uint64_t ExplicitArgOffset = 0; + const DataLayout &DL = F.getDataLayout(); + const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(); + unsigned NumPreloadsRequested = 
KernargPreloadCount; + unsigned NumPreloadedExplicitArgs = 0; + for (Argument &Arg : F.args()) { + // Avoid incompatible attributes and guard against running this pass + // twice. + // + // TODO: Preload byref kernel arguments + if (Arg.hasByRefAttr() || Arg.hasNestAttr() || + Arg.hasAttribute("amdgpu-hidden-argument")) + break; + + // Inreg may be pre-existing on some arguments, try to preload these. + if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr()) + break; + + // FIXME: Preload aggregates. + if (Arg.getType()->isAggregateType()) + break; + + Type *ArgTy = Arg.getType(); + Align ABITypeAlign = DL.getABITypeAlign(ArgTy); + uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); + ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; + + if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset)) + break; + + Arg.addAttr(Attribute::InReg); + NumPreloadedExplicitArgs++; + if (NumPreloadsRequested > 0) + NumPreloadsRequested--; + } + + // Only try preloading hidden arguments if we can successfully preload the + // last explicit argument. + if (NumPreloadedExplicitArgs == F.arg_size()) { + uint64_t ImplicitArgsBaseOffset = + alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) + + BaseOffset; + PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset, + FunctionsToErase); + } + + Changed |= NumPreloadedExplicitArgs > 0; + } + + // Erase cloned functions if we needed to update the kernel signature to + // support preloading hidden kernel arguments. + for (auto *F : FunctionsToErase) + F->eraseFromParent(); + + return Changed; +} + +bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) { + if (skipModule(M) || !TM) + return false; + + return markKernelArgsAsInreg(M, *TM); +} + +PreservedAnalyses +AMDGPUPreloadKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) { + bool Changed = markKernelArgsAsInreg(M, TM); + return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index affee55704c79..22a4d0a7fcc23 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -178,12 +178,14 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass { } }; -unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) { +static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM, + const Function &F) { if (!TM.getTargetTriple().isAMDGCN()) return 128; const GCNSubtarget &ST = TM.getSubtarget(F); - unsigned MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); + unsigned MaxVGPRs = ST.getMaxNumVGPRs( + ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first); // A non-entry function has only 32 caller preserved registers. // Do not promote alloca which will force spilling unless we know the function @@ -306,9 +308,9 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { if (!ST.isPromoteAllocaEnabled()) return false; - MaxVGPRs = getMaxVGPRs(TM, F); + MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F); - bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false; + bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F); // Use up to 1/4 of available register budget for vectorization. // FIXME: Increase the limit for whole function budgets? Perhaps x2? @@ -1340,29 +1342,14 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } unsigned MaxOccupancy = - ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second; - - // Restrict local memory usage so that we don't drastically reduce occupancy, - // unless it is already significantly reduced. - - // TODO: Have some sort of hint or other heuristics to guess occupancy based - // on other factors.. 
- unsigned OccupancyHint = ST.getWavesPerEU(F).second; - if (OccupancyHint == 0) - OccupancyHint = 7; - - // Clamp to max value. - OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); - - // Check the hint but ignore it if it's obviously wrong from the existing LDS - // usage. - MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); + ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F) + .second; // Round up to the next tier of usage. unsigned MaxSizeWithWaveCount = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); - // Program is possibly broken by using more local mem than available. + // Program may already use more LDS than is usable at maximum occupancy. if (CurrentLocalMemUsage > MaxSizeWithWaveCount) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 224c368cff4a1..39b9959a306ba 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -733,7 +733,7 @@ Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B, for (unsigned i = 0; i < NumParts; ++i) { Register SrcPart = SrcParts[i]; - Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); MRI.setType(DstPart, NumParts == 1 ? Ty : S32); const TargetRegisterClass *Constrained = @@ -3217,10 +3217,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg); return; } - case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { - unsigned N = MI.getNumExplicitOperands() - 2; + case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: + case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY: + case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: { + bool IsDualOrBVH8 = + MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY || + MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY; + unsigned NumMods = IsDualOrBVH8 ? 
0 : 1; // Has A16 modifier + unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods; applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(B, MI, {N}); + executeInWaterfallLoop(B, MI, {LastRegOpIdx}); return; } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: @@ -4561,6 +4567,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_dot4_f32_bf8_bf8: case Intrinsic::amdgcn_cvt_f32_fp8: case Intrinsic::amdgcn_cvt_f32_bf8: + case Intrinsic::amdgcn_cvt_off_f32_i4: case Intrinsic::amdgcn_cvt_pk_f32_fp8: case Intrinsic::amdgcn_cvt_pk_f32_bf8: case Intrinsic::amdgcn_cvt_pk_fp8_f32: @@ -5012,11 +5019,27 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { assert(RSrcIntrin->IsImage); return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); } - case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { - unsigned N = MI.getNumExplicitOperands() - 2; - OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); - OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI); - if (N == 3) { + case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: + case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY: + case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: { + bool IsDualOrBVH8 = + MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY || + MI.getOpcode() == AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY; + unsigned NumMods = IsDualOrBVH8 ? 
0 : 1; // Has A16 modifier + unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods; + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + if (IsDualOrBVH8) { + OpdsMapping[1] = AMDGPU::getValueMapping( + AMDGPU::VGPRRegBankID, + MRI.getType(MI.getOperand(1).getReg()).getSizeInBits()); + OpdsMapping[2] = AMDGPU::getValueMapping( + AMDGPU::VGPRRegBankID, + MRI.getType(MI.getOperand(2).getReg()).getSizeInBits()); + } + OpdsMapping[LastRegOpIdx] = + getSGPROpMapping(MI.getOperand(LastRegOpIdx).getReg(), MRI, *TRI); + if (LastRegOpIdx == 3) { // Sequential form: all operands combined into VGPR256/VGPR512 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); if (Size > 256) @@ -5024,7 +5047,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); } else { // NSA form - for (unsigned I = 2; I < N; ++I) { + unsigned FirstSrcOpIdx = IsDualOrBVH8 ? 
4 : 2; + for (unsigned I = FirstSrcOpIdx; I < LastRegOpIdx; ++I) { unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits(); OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); } @@ -5254,7 +5278,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); break; - case Intrinsic::amdgcn_ds_bvh_stack_rtn: { + case Intrinsic::amdgcn_ds_bvh_stack_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: { OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst OpdsMapping[1] = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index d98a0ffcaf7e3..510d9b97b28f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -55,9 +55,9 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, return getLocalMemorySize() / WorkGroupsPerCU; } -std::pair -AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, - const Function &F) const { +std::pair AMDGPUSubtarget::getOccupancyWithWorkGroupSizes( + uint32_t LDSBytes, std::pair FlatWorkGroupSizes) const { + // FIXME: We should take into account the LDS allocation granularity. const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u); @@ -81,7 +81,7 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, // workgroups, maximum number of waves, and minimum occupancy. The opposite is // generally true for the minimum group size. LDS or barrier ressource // limitations can flip those minimums/maximums. 
- const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F); + const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes; auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize); auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize); @@ -180,45 +180,52 @@ std::pair AMDGPUSubtarget::getFlatWorkGroupSizes( } std::pair AMDGPUSubtarget::getEffectiveWavesPerEU( - std::pair Requested, - std::pair FlatWorkGroupSizes) const { - // Default minimum/maximum number of waves per execution unit. - std::pair Default(1, getMaxWavesPerEU()); - - // If minimum/maximum flat work group sizes were explicitly requested using - // "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum - // number of waves per execution unit to values implied by requested - // minimum/maximum flat work group sizes. - unsigned MinImpliedByFlatWorkGroupSize = - getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second); - Default.first = MinImpliedByFlatWorkGroupSize; - - // Make sure requested minimum is less than requested maximum. - if (Requested.second && Requested.first > Requested.second) + std::pair RequestedWavesPerEU, + std::pair FlatWorkGroupSizes, unsigned LDSBytes) const { + // Default minimum/maximum number of waves per EU. The range of flat workgroup + // sizes limits the achievable maximum, and we aim to support enough waves per + // EU so that we can concurrently execute all waves of a single workgroup of + // maximum size on a CU. + std::pair Default = { + getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second), + getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second}; + Default.first = std::min(Default.first, Default.second); + + // Make sure requested minimum is within the default range and lower than the + // requested maximum. The latter must not violate target specification. 
+ if (RequestedWavesPerEU.first < Default.first || + RequestedWavesPerEU.first > Default.second || + RequestedWavesPerEU.first > RequestedWavesPerEU.second || + RequestedWavesPerEU.second > getMaxWavesPerEU()) return Default; - // Make sure requested values do not violate subtarget's specifications. - if (Requested.first < getMinWavesPerEU() || - Requested.second > getMaxWavesPerEU()) - return Default; - - // Make sure requested values are compatible with values implied by requested - // minimum/maximum flat work group sizes. - if (Requested.first < MinImpliedByFlatWorkGroupSize) - return Default; + // We cannot exceed maximum occupancy implied by flat workgroup size and LDS. + RequestedWavesPerEU.second = + std::min(RequestedWavesPerEU.second, Default.second); + return RequestedWavesPerEU; +} - return Requested; +std::pair +AMDGPUSubtarget::getWavesPerEU(const Function &F) const { + // Default/requested minimum/maximum flat work group sizes. + std::pair FlatWorkGroupSizes = getFlatWorkGroupSizes(F); + // Minimum number of bytes allocated in the LDS. + unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", + {0, UINT32_MAX}, true) + .first; + return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F); } -std::pair AMDGPUSubtarget::getWavesPerEU( - const Function &F, std::pair FlatWorkGroupSizes) const { +std::pair +AMDGPUSubtarget::getWavesPerEU(std::pair FlatWorkGroupSizes, + unsigned LDSBytes, const Function &F) const { // Default minimum/maximum number of waves per execution unit. std::pair Default(1, getMaxWavesPerEU()); // Requested minimum/maximum number of waves per execution unit. 
std::pair Requested = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true); - return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes); + return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes); } static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 5944b69ce6416..21b8c94a7ae5e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -106,11 +106,7 @@ class AMDGPUSubtarget { /// be converted to integer, violate subtarget's specifications, or are not /// compatible with minimum/maximum number of waves limited by flat work group /// size, register usage, and/or lds usage. - std::pair getWavesPerEU(const Function &F) const { - // Default/requested minimum/maximum flat work group sizes. - std::pair FlatWorkGroupSizes = getFlatWorkGroupSizes(F); - return getWavesPerEU(F, FlatWorkGroupSizes); - } + std::pair getWavesPerEU(const Function &F) const; /// Overload which uses the specified values for the flat work group sizes, /// rather than querying the function itself. \p FlatWorkGroupSizes Should @@ -118,9 +114,23 @@ class AMDGPUSubtarget { std::pair getWavesPerEU(const Function &F, std::pair FlatWorkGroupSizes) const; - std::pair getEffectiveWavesPerEU( - std::pair WavesPerEU, - std::pair FlatWorkGroupSizes) const; + + /// Overload which uses the specified values for the flat workgroup sizes and + /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes + /// should correspond to the function's value for getFlatWorkGroupSizes and \p + /// LDSBytes to the per-workgroup LDS allocation. + std::pair + getWavesPerEU(std::pair FlatWorkGroupSizes, + unsigned LDSBytes, const Function &F) const; + + /// Returns the target minimum/maximum number of waves per EU. 
This is based + /// on the minimum/maximum number of \p RequestedWavesPerEU and further + /// limited by the maximum achievable occupancy derived from the range of \p + /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. + std::pair + getEffectiveWavesPerEU(std::pair RequestedWavesPerEU, + std::pair FlatWorkGroupSizes, + unsigned LDSBytes) const; /// Return the amount of LDS that can be used that will not restrict the /// occupancy lower than WaveCount. @@ -133,7 +143,16 @@ class AMDGPUSubtarget { /// This notably depends on the range of allowed flat group sizes for the /// function and hardware characteristics. std::pair - getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const; + getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const { + return getOccupancyWithWorkGroupSizes(LDSBytes, getFlatWorkGroupSizes(F)); + } + + /// Overload which uses the specified values for the flat work group sizes, + /// rather than querying the function itself. \p FlatWorkGroupSizes should + /// correspond to the function's value for getFlatWorkGroupSizes. + std::pair getOccupancyWithWorkGroupSizes( + uint32_t LDSBytes, + std::pair FlatWorkGroupSizes) const; /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can /// be achieved when the only function running on a CU is \p MF. 
This notably diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 17207773b4858..270bb66dbaa9a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -192,8 +192,7 @@ class AMDGPUSwLowerLDS { void getLDSMemoryInstructions(Function *Func, SetVector &LDSInstructions); void replaceKernelLDSAccesses(Function *Func); - Value *getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr, - Value *LDSPtr); + Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr); void translateLDSMemoryOperationsToGlobalMemory( Function *Func, Value *LoadMallocPtr, SetVector &LDSInstructions); @@ -299,8 +298,7 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { for (User *V : GV->users()) { if (auto *I = dyn_cast(V)) { Function *F = I->getFunction(); - if (!isKernelLDS(F) && F->hasFnAttribute(Attribute::SanitizeAddress) && - !F->isDeclaration()) + if (!isKernelLDS(F) && !F->isDeclaration()) FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV); } } @@ -655,20 +653,30 @@ void AMDGPUSwLowerLDS::getLDSMemoryInstructions( } else if (AtomicCmpXchgInst *XCHG = dyn_cast(&Inst)) { if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) LDSInstructions.insert(&Inst); + } else if (AddrSpaceCastInst *ASC = dyn_cast(&Inst)) { + if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) + LDSInstructions.insert(&Inst); } else continue; } } } -Value * -AMDGPUSwLowerLDS::getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr, +Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr) { assert(LDSPtr && "Invalid LDS pointer operand"); - Value *PtrToInt = IRB.CreatePtrToInt(LDSPtr, IRB.getInt32Ty()); - Value *GEP = - IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {PtrToInt}); - return GEP; + Type *LDSPtrType = LDSPtr->getType(); + LLVMContext &Ctx = 
M.getContext(); + const DataLayout &DL = M.getDataLayout(); + Type *IntTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); + if (auto *VecPtrTy = dyn_cast(LDSPtrType)) { + // Handle vector of pointers + ElementCount NumElements = VecPtrTy->getElementCount(); + IntTy = VectorType::get(IntTy, NumElements); + } + Value *GepIndex = IRB.CreatePtrToInt(LDSPtr, IntTy); + return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {GepIndex}); } void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( @@ -681,7 +689,7 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( if (LoadInst *LI = dyn_cast(Inst)) { Value *LIOperand = LI->getPointerOperand(); Value *Replacement = - getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, LIOperand); + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand); LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement, LI->getAlign(), LI->isVolatile()); NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID()); @@ -691,7 +699,7 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( } else if (StoreInst *SI = dyn_cast(Inst)) { Value *SIOperand = SI->getPointerOperand(); Value *Replacement = - getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, SIOperand); + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand); StoreInst *NewSI = IRB.CreateAlignedStore( SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile()); NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); @@ -701,8 +709,8 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( } else if (AtomicRMWInst *RMW = dyn_cast(Inst)) { Value *RMWPtrOperand = RMW->getPointerOperand(); Value *RMWValOperand = RMW->getValOperand(); - Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer( - LoadMallocPtr, RMWPtrOperand); + Value *Replacement = + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, RMWPtrOperand); AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW( RMW->getOperation(), Replacement, 
RMWValOperand, RMW->getAlign(), RMW->getOrdering(), RMW->getSyncScopeID()); @@ -712,8 +720,8 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( RMW->eraseFromParent(); } else if (AtomicCmpXchgInst *XCHG = dyn_cast(Inst)) { Value *XCHGPtrOperand = XCHG->getPointerOperand(); - Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer( - LoadMallocPtr, XCHGPtrOperand); + Value *Replacement = + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, XCHGPtrOperand); AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg( Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(), XCHG->getAlign(), XCHG->getSuccessOrdering(), @@ -722,6 +730,16 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( AsanInfo.Instructions.insert(NewXCHG); XCHG->replaceAllUsesWith(NewXCHG); XCHG->eraseFromParent(); + } else if (AddrSpaceCastInst *ASC = dyn_cast(Inst)) { + Value *AIOperand = ASC->getPointerOperand(); + Value *Replacement = + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand); + Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->getType()); + // Note: No need to add the instruction to AsanInfo instructions to be + // instrumented list. FLAT_ADDRESS ptr would have been already + // instrumented by asan pass prior to this pass. + ASC->replaceAllUsesWith(NewAI); + ASC->eraseFromParent(); } else report_fatal_error("Unimplemented LDS lowering instruction"); } @@ -1115,6 +1133,17 @@ void AMDGPUSwLowerLDS::initAsanInfo() { AsanInfo.Offset = Offset; } +static bool hasFnWithSanitizeAddressAttr(FunctionVariableMap &LDSAccesses) { + for (auto &K : LDSAccesses) { + Function *F = K.first; + if (!F) + continue; + if (F->hasFnAttribute(Attribute::SanitizeAddress)) + return true; + } + return false; +} + bool AMDGPUSwLowerLDS::run() { bool Changed = false; @@ -1125,6 +1154,14 @@ bool AMDGPUSwLowerLDS::run() { // Get all the direct and indirect access of LDS for all the kernels. 
LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); + // Flag to decide whether to lower all the LDS accesses + // based on sanitize_address attribute. + bool LowerAllLDS = hasFnWithSanitizeAddressAttr(LDSUsesInfo.direct_access) || + hasFnWithSanitizeAddressAttr(LDSUsesInfo.indirect_access); + + if (!LowerAllLDS) + return Changed; + // Utility to group LDS access into direct, indirect, static and dynamic. auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses, bool DirectAccess) { @@ -1134,8 +1171,6 @@ bool AMDGPUSwLowerLDS::run() { continue; assert(isKernelLDS(F)); - if (!F->hasFnAttribute(Attribute::SanitizeAddress)) - continue; // Only inserts if key isn't already in the map. FuncLDSAccessInfo.KernelToLDSParametersMap.insert( @@ -1202,6 +1237,7 @@ bool AMDGPUSwLowerLDS::run() { // Get non-kernels with LDS ptr as argument and called by kernels. getNonKernelsWithLDSArguments(CG); + // Lower LDS accesses in non-kernels. if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() || !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) { NonKernelLDSParameters NKLDSParams; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9572c0a916f8a..2174db5e1b4a5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -29,6 +29,7 @@ #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" #include "AMDGPUUnifyDivergentExitNodes.h" +#include "AMDGPUWaitSGPRHazards.h" #include "GCNDPPCombine.h" #include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" @@ -550,6 +551,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeGCNRewritePartialRegUsesPass(*PR); initializeGCNRegPressurePrinterPass(*PR); initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); + initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); + initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR); } static std::unique_ptr 
createTLOF(const Triple &TT) { @@ -602,12 +605,15 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { - return new GCNIterativeScheduler(C, - GCNIterativeScheduler::SCHEDULE_MINREGFORCED); + auto *DAG = new GCNIterativeScheduler( + C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); + return DAG; } static ScheduleDAGInstrs * @@ -618,6 +624,7 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } @@ -782,7 +789,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #include "llvm/Passes/TargetPassRegistry.inc" PB.registerPipelineStartEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { + [this](ModulePassManager &PM, OptimizationLevel Level) { + PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this)); if (EnableHipStdPar) PM.addPass(HipStdParAcceleratorCodeSelectionPass()); }); @@ -792,6 +800,16 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { ThinOrFullLTOPhase Phase) { PM.addPass(AMDGPUPrintfRuntimeBindingPass()); + if (!isLTOPreLink(Phase)) { + // When we are not using -fgpu-rdc, we can run accelerator code + // selection relatively early, but still after linking to prevent + // eager removal of potentially reachable symbols. 
+ if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); + PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } + } + if (Level == OptimizationLevel::O0) return; @@ -859,6 +877,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { } }); + PB.registerFullLinkTimeOptimizationEarlyEPCallback( + [this](ModulePassManager &PM, OptimizationLevel) { + PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this)); + }); PB.registerFullLinkTimeOptimizationLastEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { // Promote kernel arguments to global address space for LLVM IR @@ -866,6 +888,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FunctionPassManager FPM; FPM.addPass(AMDGPUPromoteKernelArgumentsPass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + + // When we are using -fgpu-rdc, we can only run accelerator code + // selection after linking to prevent, otherwise we end up removing + // potentially reachable symbols that were exported as external in other + // modules. + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); + PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. 
@@ -1147,6 +1178,7 @@ class GCNPassConfig final : public AMDGPUPassConfig { void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; + void addPostBBSections() override; }; } // end anonymous namespace @@ -1295,8 +1327,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() { addPass(createAMDGPUAnnotateKernelFeaturesPass()); } - if (TM->getTargetTriple().getArch() == Triple::amdgcn && - EnableLowerKernelArguments) + if (TM->getTargetTriple().isAMDGCN() && + TM->getOptLevel() > CodeGenOptLevel::None) + addPass(createAMDGPUPreloadKernelArgumentsLegacyPass(TM)); + + if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); if (TM->getTargetTriple().getArch() == Triple::amdgcn) { @@ -1382,6 +1417,15 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( if (SchedStrategy == "max-memory-clause") return createGCNMaxMemoryClauseMachineScheduler(C); + if (SchedStrategy == "iterative-ilp") + return createIterativeILPMachineScheduler(C); + + if (SchedStrategy == "iterative-minreg") + return createMinRegScheduler(C); + + if (SchedStrategy == "iterative-maxocc") + return createIterativeGCNMaxOccupancyMachineScheduler(C); + return createGCNMaxOccupancyMachineScheduler(C); } @@ -1714,10 +1758,17 @@ void GCNPassConfig::addPreEmitPass() { // cases. addPass(&PostRAHazardRecognizerID); + addPass(&AMDGPUWaitSGPRHazardsLegacyID); + if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) addPass(&AMDGPUInsertDelayAluID); addPass(&BranchRelaxationPassID); +} + +void GCNPassConfig::addPostBBSections() { + // We run this later to avoid passes like livedebugvalues and BBSections + // having to deal with the apparent multi-entry functions we may generate. 
addPass(createAMDGPUPreloadKernArgPrologLegacyPass()); } @@ -2044,6 +2095,9 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { // AMDGPUAnnotateKernelFeaturesPass is missing here, but it will hopefully be // deleted soon. + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(AMDGPUPreloadKernelArgumentsPass(TM)); + if (EnableLowerKernelArguments) addPass(AMDGPULowerKernelArgumentsPass(TM)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 5160851f8c442..842bdcabac366 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -313,6 +313,24 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const { return !F || !ST->isSingleLaneExecution(*F); } +unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) { + // For certain 8 bit ops, we can pack a v4i8 into a single part + // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we + // do not limit the numberOfParts for 8 bit vectors to the + // legalization costs of such. It is left up to other target + // queries (e.g. get*InstrCost) to decide the proper handling + // of 8 bit vectors. + if (FixedVectorType *VTy = dyn_cast(Tp)) { + if (ST->shouldCoerceIllegalTypes() && + DL.getTypeSizeInBits(VTy->getElementType()) == 8) { + unsigned ElCount = VTy->getElementCount().getFixedValue(); + return std::max(UINT64_C(1), PowerOf2Ceil(ElCount / 4)); + } + } + + return BaseT::getNumberOfParts(Tp); +} + unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector // registers. See getRegisterClassForType for the implementation. 
@@ -344,9 +362,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { if (Opcode == Instruction::Load || Opcode == Instruction::Store) return 32 * 4 / ElemWidth; - return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 - : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2 - : 1; + + return (ST->shouldCoerceIllegalTypes() && ElemWidth == 8) ? 4 + : (ElemWidth == 16) ? 2 + : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2 + : 1; } unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, @@ -1154,14 +1174,16 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp); - // Larger vector widths may require additional instructions, but are - // typically cheaper than scalarized versions. - unsigned NumVectorElts = cast(VT)->getNumElements(); + unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType()); if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && - DL.getTypeSizeInBits(VT->getElementType()) == 16) { - bool HasVOP3P = ST->hasVOP3PInsts(); + (ScalarSize == 16 || + (ScalarSize == 8 && ST->shouldCoerceIllegalTypes()))) { + // Larger vector widths may require additional instructions, but are + // typically cheaper than scalarized versions. + unsigned NumVectorElts = cast(VT)->getNumElements(); unsigned RequestedElts = count_if(Mask, [](int MaskElt) { return MaskElt != -1; }); + unsigned EltsPerReg = 32 / ScalarSize; if (RequestedElts == 0) return 0; switch (Kind) { @@ -1170,9 +1192,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, case TTI::SK_PermuteSingleSrc: { // With op_sel VOP3P instructions freely can access the low half or high // half of a register, so any swizzle of two elements is free. 
- if (HasVOP3P && NumVectorElts == 2) + if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2) return 0; - unsigned NumPerms = alignTo(RequestedElts, 2) / 2; + unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg; // SK_Broadcast just reuses the same mask unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms; return NumPerms + NumPermMasks; @@ -1184,12 +1206,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return 0; // Insert/extract subvectors only require shifts / extract code to get the // relevant bits - return alignTo(RequestedElts, 2) / 2; + return alignTo(RequestedElts, EltsPerReg) / EltsPerReg; } case TTI::SK_PermuteTwoSrc: case TTI::SK_Splice: case TTI::SK_Select: { - unsigned NumPerms = alignTo(RequestedElts, 2) / 2; + unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg; // SK_Select just reuses the same mask unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms; return NumPerms + NumPermMasks; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 585f38fc02c29..2fd76e8a0e372 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -118,6 +118,7 @@ class GCNTTIImpl final : public BasicTTIImplBase { return TTI::PSK_FastHardware; } + unsigned getNumberOfParts(Type *Tp); unsigned getNumberOfRegisters(unsigned RCID) const; TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const; unsigned getMinVectorRegisterBitWidth() const; @@ -226,6 +227,10 @@ class GCNTTIImpl final : public BasicTTIImplBase { std::optional instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const; + + Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC, + IntrinsicInst &II) const; + std::optional simplifyDemandedVectorEltsIntrinsic( InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt 
&UndefElts3, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index fda2a38c2464e..d087fbc86545c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -215,7 +215,10 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, PDT.roots(), [&](auto BB) { return !isUniformlyReached(UA, *BB); }); for (BasicBlock *BB : PDT.roots()) { - if (isa(BB->getTerminator())) { + if (auto *RI = dyn_cast(BB->getTerminator())) { + auto *CI = dyn_cast_or_null(RI->getPrevNode()); + if (CI && CI->isMustTailCall()) + continue; if (HasDivergentExitBlock) ReturningBlocks.push_back(BB); } else if (isa(BB->getTerminator())) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp new file mode 100644 index 0000000000000..e70d6aab306fe --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp @@ -0,0 +1,517 @@ +//===- AMDGPUWaitSGPRHazards.cpp - Insert waits for SGPR read hazards -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Insert s_wait_alu instructions to mitigate SGPR read hazards on GFX12. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUWaitSGPRHazards.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/SetVector.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-wait-sgpr-hazards" + +static cl::opt GlobalEnableSGPRHazardWaits( + "amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden, + cl::desc("Enable required s_wait_alu on SGPR hazards")); + +static cl::opt GlobalCullSGPRHazardsOnFunctionBoundary( + "amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden, + cl::desc("Cull hazards on function boundaries")); + +static cl::opt + GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull", + cl::init(false), cl::Hidden, + cl::desc("Cull hazards on memory waits")); + +static cl::opt GlobalCullSGPRHazardsMemWaitThreshold( + "amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden, + cl::desc("Number of tracked SGPRs before initiating hazard cull on memory " + "wait")); + +namespace { + +class AMDGPUWaitSGPRHazards { +public: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + unsigned DsNopCount; + + bool EnableSGPRHazardWaits; + bool CullSGPRHazardsOnFunctionBoundary; + bool CullSGPRHazardsAtMemWait; + unsigned CullSGPRHazardsMemWaitThreshold; + + AMDGPUWaitSGPRHazards() {} + + // Return the numeric ID 0-127 for a given SGPR. 
+ static std::optional sgprNumber(Register Reg, + const SIRegisterInfo &TRI) { + switch (Reg) { + case AMDGPU::M0: + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::SGPR_NULL: + case AMDGPU::SGPR_NULL64: + return {}; + default: + break; + } + unsigned RegN = TRI.getHWRegIndex(Reg); + if (RegN > 127) + return {}; + return RegN; + } + + static inline bool isVCC(Register Reg) { + return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI; + } + + // Adjust global offsets for instructions bundled with S_GETPC_B64 after + // insertion of a new instruction. + static void updateGetPCBundle(MachineInstr *NewMI) { + if (!NewMI->isBundled()) + return; + + // Find start of bundle. + auto I = NewMI->getIterator(); + while (I->isBundledWithPred()) + I--; + if (I->isBundle()) + I++; + + // Bail if this is not an S_GETPC bundle. + if (I->getOpcode() != AMDGPU::S_GETPC_B64) + return; + + // Update offsets of any references in the bundle. + const unsigned NewBytes = 4; + assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + "Unexpected instruction insertion in bundle"); + auto NextMI = std::next(NewMI->getIterator()); + auto End = NewMI->getParent()->end(); + while (NextMI != End && NextMI->isBundledWithPred()) { + for (auto &Operand : NextMI->operands()) { + if (Operand.isGlobal()) + Operand.setOffset(Operand.getOffset() + NewBytes); + } + NextMI++; + } + } + + struct HazardState { + static constexpr unsigned None = 0; + static constexpr unsigned SALU = (1 << 0); + static constexpr unsigned VALU = (1 << 1); + + std::bitset<64> Tracked; // SGPR banks ever read by VALU + std::bitset<128> SALUHazards; // SGPRs with uncommitted values from SALU + std::bitset<128> VALUHazards; // SGPRs with uncommitted values from VALU + unsigned VCCHazard = None; // Source of current VCC writes + bool ActiveFlat = false; // Has unwaited flat instructions + + bool merge(const HazardState &RHS) { + HazardState Orig(*this); + *this |= RHS; + return 
(*this != Orig); + } + + bool operator==(const HazardState &RHS) const { + return Tracked == RHS.Tracked && SALUHazards == RHS.SALUHazards && + VALUHazards == RHS.VALUHazards && VCCHazard == RHS.VCCHazard && + ActiveFlat == RHS.ActiveFlat; + } + + bool operator!=(const HazardState &RHS) const { return !(*this == RHS); } + + void operator|=(const HazardState &RHS) { + Tracked |= RHS.Tracked; + SALUHazards |= RHS.SALUHazards; + VALUHazards |= RHS.VALUHazards; + VCCHazard |= RHS.VCCHazard; + ActiveFlat |= RHS.ActiveFlat; + } + }; + + struct BlockHazardState { + HazardState In; + HazardState Out; + }; + + DenseMap BlockState; + + static constexpr unsigned WAVE32_NOPS = 4; + static constexpr unsigned WAVE64_NOPS = 8; + + void insertHazardCull(MachineBasicBlock &MBB, + MachineBasicBlock::instr_iterator &MI) { + assert(!MI->isBundled()); + unsigned Count = DsNopCount; + while (Count--) + BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP)); + } + + bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) { + enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 }; + + HazardState State = BlockState[&MBB].In; + SmallSet SeenRegs; + bool Emitted = false; + unsigned DsNops = 0; + + for (MachineBasicBlock::instr_iterator MI = MBB.instr_begin(), + E = MBB.instr_end(); + MI != E; ++MI) { + if (MI->isMetaInstruction()) + continue; + + // Clear tracked SGPRs if sufficient DS_NOPs occur + if (MI->getOpcode() == AMDGPU::DS_NOP) { + if (++DsNops >= DsNopCount) + State.Tracked.reset(); + continue; + } + DsNops = 0; + + // Snoop FLAT instructions to avoid adding culls before scratch/lds loads. + // Culls could be disproportionate in cost to load time. 
+ if (SIInstrInfo::isFLAT(*MI) && !SIInstrInfo::isFLATGlobal(*MI)) + State.ActiveFlat = true; + + // SMEM or VMEM clears hazards + if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSMRD(*MI)) { + State.VCCHazard = HazardState::None; + State.SALUHazards.reset(); + State.VALUHazards.reset(); + continue; + } + + // Existing S_WAITALU can clear hazards + if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) { + unsigned int Mask = MI->getOperand(0).getImm(); + if (AMDGPU::DepCtr::decodeFieldVaVcc(Mask) == 0) + State.VCCHazard &= ~HazardState::VALU; + if (AMDGPU::DepCtr::decodeFieldSaSdst(Mask) == 0) { + State.SALUHazards.reset(); + State.VCCHazard &= ~HazardState::SALU; + } + if (AMDGPU::DepCtr::decodeFieldVaSdst(Mask) == 0) + State.VALUHazards.reset(); + continue; + } + + // Snoop counter waits to insert culls + if (CullSGPRHazardsAtMemWait && + (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT || + MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT || + MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) && + (MI->getOperand(0).isImm() && MI->getOperand(0).getImm() == 0) && + (State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) { + if (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) { + State.ActiveFlat = false; + } else { + State.Tracked.reset(); + if (Emit) + insertHazardCull(MBB, MI); + continue; + } + } + + // Process only VALUs and SALUs + bool IsVALU = SIInstrInfo::isVALU(*MI); + bool IsSALU = SIInstrInfo::isSALU(*MI); + if (!IsVALU && !IsSALU) + continue; + + unsigned Wait = 0; + + auto processOperand = [&](const MachineOperand &Op, bool IsUse) { + if (!Op.isReg()) + return; + Register Reg = Op.getReg(); + assert(!Op.getSubReg()); + if (!TRI->isSGPRReg(*MRI, Reg)) + return; + + // Only visit each register once + if (!SeenRegs.insert(Reg).second) + return; + + auto RegNumber = sgprNumber(Reg, *TRI); + if (!RegNumber) + return; + + // Track SGPRs by pair -- numeric ID of an 64b SGPR pair. + // i.e. 
SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc + unsigned RegN = *RegNumber; + unsigned PairN = (RegN >> 1) & 0x3f; + + // Read/write of untracked register is safe; but must record any new + // reads. + if (!State.Tracked[PairN]) { + if (IsVALU && IsUse) + State.Tracked.set(PairN); + return; + } + + uint8_t SGPRCount = + AMDGPU::getRegBitWidth(*TRI->getRegClassForReg(*MRI, Reg)) / 32; + + if (IsUse) { + // SALU reading SGPR clears VALU hazards + if (IsSALU) { + if (isVCC(Reg)) { + if (State.VCCHazard & HazardState::VALU) + State.VCCHazard = HazardState::None; + } else { + State.VALUHazards.reset(); + } + } + // Compute required waits + for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) { + Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0; + Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0; + } + if (isVCC(Reg) && State.VCCHazard) { + // Note: it's possible for both SALU and VALU to exist if VCC + // was updated differently by merged predecessors. + if (State.VCCHazard & HazardState::SALU) + Wait |= WA_SALU; + if (State.VCCHazard & HazardState::VALU) + Wait |= WA_VCC; + } + } else { + // Update hazards + if (isVCC(Reg)) { + State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU; + } else { + for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) { + if (IsSALU) + State.SALUHazards.set(RegN + RegIdx); + else + State.VALUHazards.set(RegN + RegIdx); + } + } + } + }; + + const bool IsSetPC = + (MI->isCall() || MI->isReturn() || MI->isIndirectBranch()) && + MI->getOpcode() != AMDGPU::S_ENDPGM && + MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED; + + // Only consider implicit VCC specified by instruction descriptor. 
+ const bool HasImplicitVCC = + llvm::any_of(MI->getDesc().implicit_uses(), + [](MCPhysReg Reg) { return isVCC(Reg); }) || + llvm::any_of(MI->getDesc().implicit_defs(), + [](MCPhysReg Reg) { return isVCC(Reg); }); + + if (IsSetPC) { + // All SGPR writes before a call/return must be flushed as the + // callee/caller will not will not see the hazard chain. + if (State.VCCHazard & HazardState::VALU) + Wait |= WA_VCC; + if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU)) + Wait |= WA_SALU; + if (State.VALUHazards.any()) + Wait |= WA_VALU; + if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) { + State.Tracked.reset(); + if (Emit) + insertHazardCull(MBB, MI); + } + } else { + // Process uses to determine required wait. + SeenRegs.clear(); + for (const MachineOperand &Op : MI->all_uses()) { + if (Op.isImplicit() && + (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg()))) + continue; + processOperand(Op, true); + } + } + + // Apply wait + if (Wait) { + unsigned Mask = 0xffff; + if (Wait & WA_VCC) { + State.VCCHazard &= ~HazardState::VALU; + Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0); + } + if (Wait & WA_SALU) { + State.SALUHazards.reset(); + State.VCCHazard &= ~HazardState::SALU; + Mask = AMDGPU::DepCtr::encodeFieldSaSdst(Mask, 0); + } + if (Wait & WA_VALU) { + State.VALUHazards.reset(); + Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0); + } + if (Emit) { + auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(Mask); + updateGetPCBundle(NewMI); + Emitted = true; + } + } + + // On return from a call SGPR state is unknown, so all potential hazards. + if (MI->isCall() && !CullSGPRHazardsOnFunctionBoundary) + State.Tracked.set(); + + // Update hazards based on defs. 
+ SeenRegs.clear(); + for (const MachineOperand &Op : MI->all_defs()) { + if (Op.isImplicit() && + (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg()))) + continue; + processOperand(Op, false); + } + } + + bool Changed = State != BlockState[&MBB].Out; + if (Emit) { + assert(!Changed && "Hazard state should not change on emit pass"); + return Emitted; + } + if (Changed) + BlockState[&MBB].Out = State; + return Changed; + } + + bool run(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget(); + if (!ST.hasVALUReadSGPRHazard()) + return false; + + // Parse settings + EnableSGPRHazardWaits = GlobalEnableSGPRHazardWaits; + CullSGPRHazardsOnFunctionBoundary = GlobalCullSGPRHazardsOnFunctionBoundary; + CullSGPRHazardsAtMemWait = GlobalCullSGPRHazardsAtMemWait; + CullSGPRHazardsMemWaitThreshold = GlobalCullSGPRHazardsMemWaitThreshold; + + if (!GlobalEnableSGPRHazardWaits.getNumOccurrences()) + EnableSGPRHazardWaits = MF.getFunction().getFnAttributeAsParsedInteger( + "amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits); + if (!GlobalCullSGPRHazardsOnFunctionBoundary.getNumOccurrences()) + CullSGPRHazardsOnFunctionBoundary = + MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-boundary-cull"); + if (!GlobalCullSGPRHazardsAtMemWait.getNumOccurrences()) + CullSGPRHazardsAtMemWait = + MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-mem-wait-cull"); + if (!GlobalCullSGPRHazardsMemWaitThreshold.getNumOccurrences()) + CullSGPRHazardsMemWaitThreshold = + MF.getFunction().getFnAttributeAsParsedInteger( + "amdgpu-sgpr-hazard-mem-wait-cull-threshold", + CullSGPRHazardsMemWaitThreshold); + + // Bail if disabled + if (!EnableSGPRHazardWaits) + return false; + + TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + MRI = &MF.getRegInfo(); + DsNopCount = ST.isWave64() ? 
WAVE64_NOPS : WAVE32_NOPS;
+
+    auto CallingConv = MF.getFunction().getCallingConv();
+    if (!AMDGPU::isEntryFunctionCC(CallingConv) &&
+        !CullSGPRHazardsOnFunctionBoundary) {
+      // Callee must consider all SGPRs as tracked.
+      LLVM_DEBUG(dbgs() << "Is called function, track all SGPRs.\n");
+      MachineBasicBlock &EntryBlock = MF.front();
+      BlockState[&EntryBlock].In.Tracked.set();
+    }
+
+    // Calculate the hazard state for each basic block.
+    // Iterate until a fixed point is reached.
+    // Fixed point is guaranteed as merge function only ever increases
+    // the hazard set, and all backedges will cause a merge.
+    //
+    // Note: we have to take care of the entry block as this technically
+    // has an edge from outside the function. Failure to treat this as
+    // a merge could prevent fixed point being reached.
+    SetVector<MachineBasicBlock *> Worklist;
+    for (auto &MBB : reverse(MF))
+      Worklist.insert(&MBB);
+    while (!Worklist.empty()) {
+      auto &MBB = *Worklist.pop_back_val();
+      bool Changed = runOnMachineBasicBlock(MBB, false);
+      if (Changed) {
+        // Note: take a copy of state here in case it is reallocated by map
+        HazardState NewState = BlockState[&MBB].Out;
+        // Propagate to all successor blocks
+        for (auto Succ : MBB.successors()) {
+          // We only need to merge hazards at CFG merge points.
+          auto &SuccState = BlockState[Succ];
+          if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
+            if (SuccState.In != NewState) {
+              SuccState.In = NewState;
+              Worklist.insert(Succ);
+            }
+          } else if (SuccState.In.merge(NewState)) {
+            Worklist.insert(Succ);
+          }
+        }
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "Emit s_wait_alu instructions\n");
+
+    // Final pass to emit wait instructions.
+ bool Changed = false; + for (auto &MBB : MF) + Changed |= runOnMachineBasicBlock(MBB, true); + + BlockState.clear(); + return Changed; + } +}; + +class AMDGPUWaitSGPRHazardsLegacy : public MachineFunctionPass { +public: + static char ID; + + AMDGPUWaitSGPRHazardsLegacy() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + return AMDGPUWaitSGPRHazards().run(MF); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // namespace + +char AMDGPUWaitSGPRHazardsLegacy::ID = 0; + +char &llvm::AMDGPUWaitSGPRHazardsLegacyID = AMDGPUWaitSGPRHazardsLegacy::ID; + +INITIALIZE_PASS(AMDGPUWaitSGPRHazardsLegacy, DEBUG_TYPE, + "AMDGPU Insert waits for SGPR read hazards", false, false) + +PreservedAnalyses +AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + if (AMDGPUWaitSGPRHazards().run(MF)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.h b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.h new file mode 100644 index 0000000000000..58e9bca4c3ede --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.h @@ -0,0 +1,25 @@ +//===--- AMDGPUWaitSGPRHazards.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUWAITSGPRHAZARDS_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUWAITSGPRHAZARDS_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class AMDGPUWaitSGPRHazardsPass + : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUWAITSGPRHAZARDS_H diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d8f441d1ccfe4..4c2c49caad78f 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1872,6 +1872,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { void cvtVOP3(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); + void cvtScaledMFMA(MCInst &Inst, const OperandVector &Operands); void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); @@ -4853,7 +4854,10 @@ bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const { bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const { auto FB = getFeatureBits(); - if (!FB[AMDGPU::FeatureGFX90AInsts]) + unsigned Opc = Inst.getOpcode(); + // DS_READ_B96_TR_B6 is the only DS instruction in GFX950, that allows + // unaligned VGPR. All others only allow even aligned VGPRs. 
+ if (!(FB[AMDGPU::FeatureGFX90AInsts]) || Opc == AMDGPU::DS_READ_B96_TR_B6_vi) return true; const MCRegisterInfo *MRI = getMRI(); @@ -6765,17 +6769,25 @@ ParseStatus AMDGPUAsmParser::parseTH(OperandVector &Operands, int64_t &TH) { return ParseStatus::Success; } -static void addOptionalImmOperand( - MCInst& Inst, const OperandVector& Operands, - AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx, - AMDGPUOperand::ImmTy ImmT, - int64_t Default = 0) { +static void +addOptionalImmOperand(MCInst &Inst, const OperandVector &Operands, + AMDGPUAsmParser::OptionalImmIndexMap &OptionalIdx, + AMDGPUOperand::ImmTy ImmT, int64_t Default = 0, + std::optional InsertAt = std::nullopt) { auto i = OptionalIdx.find(ImmT); if (i != OptionalIdx.end()) { unsigned Idx = i->second; - ((AMDGPUOperand &)*Operands[Idx]).addImmOperands(Inst, 1); + const AMDGPUOperand &Op = + static_cast(*Operands[Idx]); + if (InsertAt) + Inst.insert(Inst.begin() + *InsertAt, MCOperand::createImm(Op.getImm())); + else + Op.addImmOperands(Inst, 1); } else { - Inst.addOperand(MCOperand::createImm(Default)); + if (InsertAt.has_value()) + Inst.insert(Inst.begin() + *InsertAt, MCOperand::createImm(Default)); + else + Inst.addOperand(MCOperand::createImm(Default)); } } @@ -8794,6 +8806,84 @@ void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands) Inst.getOperand(ModIdx).setImm(ModVal); } } +void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst, + const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + unsigned Opc = Inst.getOpcode(); + unsigned I = 1; + int CbszOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz); + + const MCInstrDesc &Desc = MII.get(Opc); + + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) + static_cast(*Operands[I++]).addRegOperands(Inst, 1); + + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = static_cast(*Operands[I]); + int NumOperands = Inst.getNumOperands(); + // The order of operands in MCInst and parsed operands are different. 
+ // Adding dummy cbsz and blgp operands at corresponding MCInst operand + // indices for parsing scale values correctly. + if (NumOperands == CbszOpIdx) { + Inst.addOperand(MCOperand::createImm(0)); + Inst.addOperand(MCOperand::createImm(0)); + } + if (isRegOrImmWithInputMods(Desc, NumOperands)) { + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if (Op.isImmModifier()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + Op.addRegOrImmOperands(Inst, 1); + } + } + + // Insert CBSZ and BLGP operands for F8F6F4 variants + auto CbszIdx = OptionalIdx.find(AMDGPUOperand::ImmTyCBSZ); + if (CbszIdx != OptionalIdx.end()) { + int CbszVal = ((AMDGPUOperand &)*Operands[CbszIdx->second]).getImm(); + Inst.getOperand(CbszOpIdx).setImm(CbszVal); + } + + int BlgpOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp); + auto BlgpIdx = OptionalIdx.find(AMDGPUOperand::ImmTyBLGP); + if (BlgpIdx != OptionalIdx.end()) { + int BlgpVal = ((AMDGPUOperand &)*Operands[BlgpIdx->second]).getImm(); + Inst.getOperand(BlgpOpIdx).setImm(BlgpVal); + } + + // Add dummy src_modifiers + Inst.addOperand(MCOperand::createImm(0)); + Inst.addOperand(MCOperand::createImm(0)); + + // Handle op_sel fields + + unsigned OpSel = 0; + auto OpselIdx = OptionalIdx.find(AMDGPUOperand::ImmTyOpSel); + if (OpselIdx != OptionalIdx.end()) { + OpSel = static_cast(*Operands[OpselIdx->second]) + .getImm(); + } + + unsigned OpSelHi = 0; + auto OpselHiIdx = OptionalIdx.find(AMDGPUOperand::ImmTyOpSelHi); + if (OpselHiIdx != OptionalIdx.end()) { + OpSelHi = static_cast(*Operands[OpselHiIdx->second]) + .getImm(); + } + const int16_t ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers}; + + for (unsigned J = 0; J < 2; ++J) { + unsigned ModVal = 0; + if (OpSel & (1 << J)) + ModVal |= SISrcMods::OP_SEL_0; + if (OpSelHi & (1 << J)) + ModVal |= SISrcMods::OP_SEL_1; + + const int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + Inst.getOperand(ModIdx).setImm(ModVal); + } +} void 
AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx) { diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 26647dd275582..ba8930e40a6f4 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp + AMDGPUExpandFeaturePredicates.cpp AMDGPUExportClustering.cpp AMDGPUFrameLowering.cpp AMDGPUGlobalISelDivergenceLowering.cpp @@ -90,6 +91,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUPostLegalizerCombiner.cpp AMDGPUPreLegalizerCombiner.cpp AMDGPUPreloadKernArgProlog.cpp + AMDGPUPreloadKernelArguments.cpp AMDGPUPrintfRuntimeBinding.cpp AMDGPUPromoteAlloca.cpp AMDGPUPromoteKernelArguments.cpp @@ -111,6 +113,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUTargetMachine.cpp AMDGPUTargetObjectFile.cpp AMDGPUTargetTransformInfo.cpp + AMDGPUWaitSGPRHazards.cpp AMDGPUUnifyDivergentExitNodes.cpp AMDGPUUnifyMetadata.cpp R600MachineCFGStructurizer.cpp diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index bc217e10e0fbd..9452b83dfcfc3 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -258,10 +258,13 @@ multiclass DS_1A2D_Off8_RET_mc +class DS_BVH_STACK : DS_Pseudo.ret:$vdst, VGPR_32:$addr), - (ins VGPR_32:$addr_in, getLdStRegisterOperand.ret:$data0, VReg_128:$data1, Offset:$offset), + (outs getLdStRegisterOperand.ret:$vdst, VGPR_32:$addr), + (ins VGPR_32:$addr_in, getLdStRegisterOperand.ret:$data0, + data1_rc:$data1, Offset:$offset), " $vdst, $addr, $data0, $data1$offset"> { let Constraints = "$addr = $addr_in"; let DisableEncoding = "$addr_in"; @@ -723,7 +726,8 @@ def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>; let SubtargetPredicate = isGFX11Plus in { let OtherPredicates = [HasImageInsts] in -def 
DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">; +def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32", + VGPR_32, VReg_128> ; } // let SubtargetPredicate = isGFX11Plus @@ -733,6 +737,13 @@ def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">; let SubtargetPredicate = isGFX12Plus in { +let OtherPredicates = [HasImageInsts] in { +def DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_BVH_STACK< + "ds_bvh_stack_push8_pop1_rtn_b32", VGPR_32, VReg_256>; +def DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_BVH_STACK< + "ds_bvh_stack_push8_pop2_rtn_b64", VReg_64, VReg_256>; +} // End OtherPredicates = [HasImageInsts]. + defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc<"ds_cond_sub_u32">; defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32>; defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">; @@ -1267,6 +1278,11 @@ defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>; defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>; defm DS_BPERMUTE_FI_B32 : DS_Real_gfx12<0x0cd>; +defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0, + "ds_bvh_stack_push4_pop1_rtn_b32", true>; +defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>; +defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>; + // New aliases added in GFX12 without renaming the instructions. 
let AssemblerPredicate = isGFX12Plus in { def : AMDGPUMnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 58cdbe6cf373e..ae1984406588b 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -265,12 +265,14 @@ DECODE_OPERAND_REG_8(VReg_128) DECODE_OPERAND_REG_8(VReg_192) DECODE_OPERAND_REG_8(VReg_256) DECODE_OPERAND_REG_8(VReg_288) +DECODE_OPERAND_REG_8(VReg_320) DECODE_OPERAND_REG_8(VReg_352) DECODE_OPERAND_REG_8(VReg_384) DECODE_OPERAND_REG_8(VReg_512) DECODE_OPERAND_REG_8(VReg_1024) DECODE_OPERAND_REG_7(SReg_32, OPW32) +DECODE_OPERAND_REG_7(SReg_32_XM0, OPW32) DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32) DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32) DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 873d18e30a430..959415648812b 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -14,7 +14,6 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -45,10 +44,6 @@ static cl::opt cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops.")); -static cl::opt MaxExhaustiveHazardSearch( - "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden, - cl::desc("Maximum function size for exhausive hazard search")); - //===----------------------------------------------------------------------===// // Hazard Recognizer Implementation //===----------------------------------------------------------------------===// @@ -60,7 +55,6 @@ 
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF), ST(MF.getSubtarget()), TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), - UseVALUReadHazardExhaustiveSearch(false), ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5; RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); @@ -1217,7 +1211,6 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixWMMAHazards(MI); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); - fixVALUReadSGPRHazard(MI); fixRequiredExportPriority(MI); } @@ -2603,20 +2596,24 @@ static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { return NumPasses + 2; } -static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { - // 2 pass -> 5 - // 4 pass -> 7 - // 8 pass -> 11 - // 16 pass -> 19 - return NumPasses + 3; +static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, + bool IsGFX950) { + // xdl def cycles | gfx940 | gfx950 + // 2 pass | 5 5 + // 4 pass | 7 8 + // 8 pass | 11 12 + // 16 pass | 19 20 + return NumPasses + 3 + (NumPasses != 2 && IsGFX950); } -static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { - // 2 pass -> 5 - // 4 pass -> 7 - // 8 pass -> 11 - // 16 pass -> 19 - return NumPasses + 3; +static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, + bool IsGFX950) { + // xdl def cycles | gfx940 | gfx950 + // 2 pass | 5 5 + // 4 pass | 7 8 + // 8 pass | 11 12 + // 16 pass | 19 20 + return NumPasses + 3 + (NumPasses != 2 && IsGFX950); } static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { @@ -2767,7 +2764,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { } else if (ST.hasGFX940Insts()) { NeedWaitStates = isXDL(ST, *MFMA) - ? 
GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses) + ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates( + NumPasses, ST.hasGFX950Insts()) : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates( NumPasses); } else { @@ -2853,7 +2851,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { } else if (ST.hasGFX940Insts()) { NeedWaitStates = isXDL(ST, *MFMA) - ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses) + ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates( + NumPasses, ST.hasGFX950Insts()) : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses); } else { switch (NumPasses) { @@ -3106,274 +3105,6 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { return true; } -// Return the numeric ID 0-63 of an 64b SGPR pair for a given SGPR. -// i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc -static std::optional sgprPairNumber(Register Reg, - const SIRegisterInfo &TRI) { - switch (Reg) { - case AMDGPU::M0: - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SGPR_NULL: - case AMDGPU::SGPR_NULL64: - return {}; - default: - break; - } - unsigned RegN = TRI.getEncodingValue(Reg); - if (RegN > 127) - return {}; - return (RegN >> 1) & 0x3f; -} - -// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs. -void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) { - assert(MMF == &MF); - - // Assume non-empty vector means it has already been computed. - if (!VALUReadHazardSGPRs.empty()) - return; - - auto CallingConv = MF.getFunction().getCallingConv(); - bool IsCallFree = - AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls(); - - // Exhaustive search is only viable in non-caller/callee functions where - // VALUs will be exposed to the hazard recognizer. 
- UseVALUReadHazardExhaustiveSearch = - IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None && - MF.getInstructionCount() <= MaxExhaustiveHazardSearch; - - // Consider all SGPRs hazards if the shader uses function calls or is callee. - bool UseVALUUseCache = - IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None; - VALUReadHazardSGPRs.resize(64, !UseVALUUseCache); - if (!UseVALUUseCache) - return; - - // Perform a post ordered reverse scan to find VALUs which read an SGPR - // before a SALU write to the same SGPR. This provides a reduction in - // hazard insertion when all VALU access to an SGPR occurs after its last - // SALU write, when compared to a linear scan. - const MachineRegisterInfo &MRI = MF.getRegInfo(); - BitVector SALUWriteSGPRs(64), ReadSGPRs(64); - MachineCycleInfo CI; - CI.compute(*MMF); - - for (auto *MBB : post_order(&MF)) { - bool InCycle = CI.getCycle(MBB) != nullptr; - for (auto &MI : reverse(MBB->instrs())) { - bool IsVALU = SIInstrInfo::isVALU(MI); - bool IsSALU = SIInstrInfo::isSALU(MI); - if (!IsVALU && !IsSALU) - continue; - - for (const MachineOperand &Op : MI.operands()) { - if (!Op.isReg()) - continue; - Register Reg = Op.getReg(); - assert(!Op.getSubReg()); - // Only consider implicit operands of VCC. - if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO || - Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC)) - continue; - if (!TRI.isSGPRReg(MRI, Reg)) - continue; - auto RegN = sgprPairNumber(Reg, TRI); - if (!RegN) - continue; - if (IsVALU && Op.isUse()) { - // Note: any access within a cycle must be considered a hazard. 
- if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN])) - VALUReadHazardSGPRs.set(*RegN); - ReadSGPRs.set(*RegN); - } else if (IsSALU) { - if (Op.isDef()) - SALUWriteSGPRs.set(*RegN); - else - ReadSGPRs.set(*RegN); - } - } - } - } -} - -bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) { - if (!ST.hasVALUReadSGPRHazard()) - return false; - - // The hazard sequence is fundamentally three instructions: - // 1. VALU reads SGPR - // 2. SALU writes SGPR - // 3. VALU/SALU reads SGPR - // Try to avoid searching for (1) because the expiry point of the hazard is - // indeterminate; however, the hazard between (2) and (3) can expire if the - // gap contains sufficient SALU instructions with no usage of SGPR from (1). - // Note: SGPRs must be considered as 64-bit pairs as hazard exists - // even if individual SGPRs are accessed. - - bool MIIsSALU = SIInstrInfo::isSALU(*MI); - bool MIIsVALU = SIInstrInfo::isVALU(*MI); - if (!(MIIsSALU || MIIsVALU)) - return false; - - // Avoid expensive search when compile time is priority by - // mitigating every SALU which writes an SGPR. - if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) { - if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI)) - return false; - - const MachineOperand *SDSTOp = - TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); - if (!SDSTOp || !SDSTOp->isReg()) - return false; - - const Register HazardReg = SDSTOp->getReg(); - if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO || - HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0) - return false; - - // Add s_wait_alu sa_sdst(0) after SALU write. - auto NextMI = std::next(MI->getIterator()); - auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), - TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); - - // SALU write may be s_getpc in a bundle. - updateGetPCBundle(NewMI); - - return true; - } - - // Pre-compute set of SGPR pairs read by VALUs. 
- // Note: pass mutable pointer to MachineFunction for CycleInfo. - computeVALUHazardSGPRs(MI->getMF()); - - // If no VALUs hazard SGPRs exist then nothing to do. - if (VALUReadHazardSGPRs.none()) - return false; - - // All SGPR writes before a call/return must be flushed as the callee/caller - // will not will not see the hazard chain, i.e. (2) to (3) described above. - const bool IsSetPC = (MI->isCall() || MI->isReturn()) && - !(MI->getOpcode() == AMDGPU::S_ENDPGM || - MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED); - - // Collect all SGPR sources for MI which are read by a VALU. - const MachineRegisterInfo &MRI = MF.getRegInfo(); - SmallSet SGPRsUsed; - - if (!IsSetPC) { - for (const MachineOperand &Op : MI->all_uses()) { - Register OpReg = Op.getReg(); - - // Only consider VCC implicit uses on VALUs. - // The only expected SALU implicit access is SCC which is no hazard. - if (MIIsSALU && Op.isImplicit()) - continue; - - if (!TRI.isSGPRReg(MRI, OpReg)) - continue; - - auto RegN = sgprPairNumber(OpReg, TRI); - if (!RegN) - continue; - - if (!VALUReadHazardSGPRs[*RegN]) - continue; - - SGPRsUsed.insert(OpReg); - } - - // No SGPRs -> nothing to do. - if (SGPRsUsed.empty()) - return false; - } - - // A hazard is any SALU which writes one of the SGPRs read by MI. - auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) { - if (!SIInstrInfo::isSALU(I)) - return false; - // Ensure SGPR flush before call/return by conservatively assuming every - // SALU writes an SGPR. - if (IsSetPC && I.getNumDefs() > 0) - return true; - // Check for any register writes. - return any_of(SGPRsUsed, [this, &I](Register Reg) { - return I.modifiesRegister(Reg, &TRI); - }); - }; - - const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11; - auto IsExpiredFn = [&](const MachineInstr &I, int Count) { - if (Count >= SALUExpiryCount) - return true; - // s_wait_alu sa_sdst(0) on path mitigates hazard. 
- if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) - return true; - return false; - }; - - auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) { - // Only count true SALUs as wait states. - if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I)) - return 0; - // SALU must be unrelated to any hazard registers. - if (any_of(SGPRsUsed, - [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); })) - return 0; - return 1; - }; - - // Check for the hazard. - DenseSet Visited; - int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(), - std::next(MI->getReverseIterator()), 0, - IsExpiredFn, Visited, WaitStatesFn); - - if (WaitStates >= SALUExpiryCount) - return false; - - // Validate hazard through an exhaustive search. - if (UseVALUReadHazardExhaustiveSearch) { - // A hazard is any VALU which reads one of the paired SGPRs read by MI. - // This is searching for (1) in the hazard description. - auto hazardPair = [this](Register Reg) { - if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI) - return Register(AMDGPU::VCC); - auto RegN = sgprPairNumber(Reg, TRI); - return Register(AMDGPU::SGPR0_SGPR1 + *RegN); - }; - auto SearchHazardFn = [this, hazardPair, - &SGPRsUsed](const MachineInstr &I) { - if (!SIInstrInfo::isVALU(I)) - return false; - // Check for any register reads. - return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) { - return I.readsRegister(hazardPair(Reg), &TRI); - }); - }; - auto SearchExpiredFn = [&](const MachineInstr &I, int Count) { - return false; - }; - if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) == - std::numeric_limits::max()) - return false; - } - - // Add s_wait_alu sa_sdst(0) before SALU read. - auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); - - // SALU read may be after s_getpc in a bundle. 
- updateGetPCBundle(NewMI); - - return true; -} - static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII) { MachineBasicBlock &EntryMBB = MF->front(); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 83ce100c58f0a..bbc55851bf967 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -48,8 +48,6 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { const SIRegisterInfo &TRI; const TargetSchedModel &TSchedModel; bool RunLdsBranchVmemWARHazardFixup; - BitVector VALUReadHazardSGPRs; - bool UseVALUReadHazardExhaustiveSearch; /// RegUnits of uses in the current soft memory clause. BitVector ClauseUses; @@ -109,8 +107,6 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { bool fixWMMAHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); - void computeVALUHazardSGPRs(MachineFunction *MMF); - bool fixVALUReadSGPRHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index da065e8d8cb6b..19cdfc01c02c4 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "GCNIterativeScheduler.h" +#include "AMDGPUIGroupLP.h" #include "GCNSchedStrategy.h" #include "SIMachineFunctionInfo.h" @@ -118,6 +119,26 @@ void GCNIterativeScheduler::printSchedRP(raw_ostream &OS, } #endif +void GCNIterativeScheduler::swapIGLPMutations(const Region &R, bool IsReentry) { + bool HasIGLPInstrs = false; + const SIInstrInfo *SII = static_cast(TII); + for (MachineBasicBlock::iterator I = R.Begin; I != R.End; I++) { + if 
(SII->isIGLPMutationOnly(I->getOpcode())) { + HasIGLPInstrs = true; + break; + } + } + + if (HasIGLPInstrs) { + SavedMutations.clear(); + SavedMutations.swap(Mutations); + auto SchedPhase = IsReentry ? AMDGPU::SchedulingPhase::PreRAReentry + : AMDGPU::SchedulingPhase::Initial; + + addMutation(createIGroupLPDAGMutation(SchedPhase)); + } +} + // DAG builder helper class GCNIterativeScheduler::BuildDAG { GCNIterativeScheduler &Sch; @@ -125,14 +146,15 @@ class GCNIterativeScheduler::BuildDAG { SmallVector BotRoots; public: - BuildDAG(const Region &R, GCNIterativeScheduler &_Sch) - : Sch(_Sch) { + BuildDAG(const Region &R, GCNIterativeScheduler &_Sch, bool IsReentry = false) + : Sch(_Sch) { auto *BB = R.Begin->getParent(); Sch.BaseClass::startBlock(BB); Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs); - + Sch.swapIGLPMutations(R, IsReentry); Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr, /*TrackLaneMask*/true); + Sch.postProcessDAG(); Sch.Topo.InitDAGTopologicalSorting(); Sch.findRootsAndBiasEdges(TopRoots, BotRoots); } @@ -432,13 +454,15 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { auto NewOcc = TargetOcc; for (auto *R : Regions) { + // Always build the DAG to add mutations + BuildDAG DAG(*R, *this); + if (R->MaxPressure.getOccupancy(ST) >= NewOcc) - break; + continue; LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); printLivenessInfo(dbgs(), R->Begin, R->End, LIS)); - BuildDAG DAG(*R, *this); const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); const auto MaxRP = getSchedulePressure(*R, MinSchedule); LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n"; @@ -469,8 +493,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( sortRegionsByPressure(TgtOcc); auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); - if (TryMaximizeOccupancy && Occ < TgtOcc) + bool IsReentry = false; + if (TryMaximizeOccupancy && Occ < TgtOcc) { Occ = tryMaximizeOccupancy(TgtOcc); + IsReentry = 
true; + } // This is really weird but for some magic scheduling regions twice // gives performance improvement @@ -489,7 +516,8 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc); for (auto *R : Regions) { OverrideLegacyStrategy Ovr(*R, LStrgy, *this); - + IsReentry |= I > 0; + swapIGLPMutations(*R, IsReentry); Ovr.schedule(); const auto RP = getRegionPressure(*R); LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); @@ -556,8 +584,11 @@ void GCNIterativeScheduler::scheduleILP( sortRegionsByPressure(TgtOcc); auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); - if (TryMaximizeOccupancy && Occ < TgtOcc) + bool IsReentry = false; + if (TryMaximizeOccupancy && Occ < TgtOcc) { Occ = tryMaximizeOccupancy(TgtOcc); + IsReentry = true; + } TgtOcc = std::min(Occ, TgtOcc); LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, " @@ -566,7 +597,7 @@ void GCNIterativeScheduler::scheduleILP( unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (auto *R : Regions) { - BuildDAG DAG(*R, *this); + BuildDAG DAG(*R, *this, IsReentry); const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this); const auto RP = getSchedulePressure(*R, ILPSchedule); diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h index c0228540b7a2f..f731b1fc7e0df 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h @@ -77,6 +77,8 @@ class GCNIterativeScheduler : public ScheduleDAGMILive { const StrategyKind Strategy; mutable GCNUpwardRPTracker UPTracker; + std::vector> SavedMutations; + class BuildDAG; class OverrideLegacyStrategy; @@ -91,6 +93,7 @@ class GCNIterativeScheduler : public ScheduleDAGMILive { return getRegionPressure(R.Begin, R.End); } + void swapIGLPMutations(const Region &R, bool IsReentry); void setBestSchedule(Region &R, ScheduleRef Schedule, const GCNRegPressure &MaxRP = GCNRegPressure()); 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 6e693066de10b..3db9c683a7d1f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -188,12 +188,6 @@ static void getRegisterPressures( Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); } -// Return true if the instruction is mutually exclusive with all non-IGLP DAG -// mutations, requiring all other mutations to be disabled. -static bool isIGLPMutationOnly(unsigned Opcode) { - return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT; -} - void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, @@ -869,6 +863,8 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, Pressure[CurRegion] = RPTracker.moveMaxPressure(); if (CurRegion-- == RegionIdx) break; + auto &Rgn = Regions[CurRegion]; + NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); } RPTracker.advanceToNext(); RPTracker.advanceBeforeNext(); @@ -1155,9 +1151,10 @@ bool GCNSchedStage::initGCNRegion() { Unsched.reserve(DAG.NumRegionInstrs); if (StageID == GCNSchedStageID::OccInitialSchedule || StageID == GCNSchedStageID::ILPInitialSchedule) { + const SIInstrInfo *SII = static_cast(DAG.TII); for (auto &I : DAG) { Unsched.push_back(&I); - if (isIGLPMutationOnly(I.getOpcode())) + if (SII->isIGLPMutationOnly(I.getOpcode())) DAG.RegionsWithIGLPInstrs[RegionIdx] = true; } } else { @@ -1905,8 +1902,9 @@ void GCNScheduleDAGMILive::updateRegionBoundaries( } static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { - return any_of(*DAG, [](MachineBasicBlock::iterator MI) { - return isIGLPMutationOnly(MI->getOpcode()); + const SIInstrInfo *SII = static_cast(DAG->TII); + return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) { + return SII->isIGLPMutationOnly(MI->getOpcode()); }); } diff --git 
a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index b5e8e246825c7..c6574925bf008 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -52,6 +52,11 @@ static cl::opt cl::desc("Number of addresses from which to enable MIMG NSA."), cl::init(2), cl::Hidden); +static cl::opt + CoerceIllegal("amdgpu-coerce-illegal-types", + cl::desc("Whether or not to coerce illegal types"), + cl::ReallyHidden, cl::init(false)); + GCNSubtarget::~GCNSubtarget() = default; GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, @@ -191,6 +196,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, RegBankInfo = std::make_unique(*this); InstSelector = std::make_unique(*this, *RegBankInfo, TM); + + ShouldCoerceIllegalTypes = CoerceIllegal; } const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 342b211199dca..728ed0ab7cbb6 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -190,6 +190,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// indicates a lack of S_CLAUSE support. unsigned MaxHardClauseLength = 0; bool SupportsSRAMECC = false; + bool HasVMemToLDSLoad = false; // This should not be used directly. 'TargetID' tracks the dynamic settings // for SRAMECC. @@ -226,6 +227,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasRestrictedSOffset = false; bool HasBitOp3Insts = false; bool HasPrngInst = false; + bool HasBVHDualAndBVH8Insts = false; bool HasPermlane16Swap = false; bool HasPermlane32Swap = false; bool HasVcmpxPermlaneHazard = false; @@ -259,6 +261,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // Dummy feature to use for assembler in tablegen. 
bool FeatureDisable = false; + bool ShouldCoerceIllegalTypes = false; + private: SIInstrInfo InstrInfo; SITargetLowering TLInfo; @@ -1312,6 +1316,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return hasGFX950Insts(); } + bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; } + bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } @@ -1360,6 +1366,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasPrngInst() const { return HasPrngInst; } + bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; @@ -1445,6 +1453,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // of sign-extending. bool hasGetPCZeroExtension() const { return GFX12Insts; } + /// \returns whether or not we should coerce illegal types into vectors of + // legal types for values that span basic blocks. + bool shouldCoerceIllegalTypes() const { return ShouldCoerceIllegalTypes; } + /// \returns SGPR allocation granularity supported by the subtarget. 
unsigned getSGPRAllocGranule() const { return AMDGPU::IsaInfo::getSGPRAllocGranule(this); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp index 14b3cdf37650c..b467dbb2cd519 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp @@ -70,6 +70,11 @@ MCKernelDescriptor::getDefaultAmdhsaKernelDescriptor(const MCSubtargetInfo *STI, KD.compute_pgm_rsrc1, OneMCExpr, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, Ctx); + + MCKernelDescriptor::bits_set( + KD.compute_pgm_rsrc1, OneMCExpr, + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS_SHIFT, + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, Ctx); } if (AMDGPU::isGFX90A(*STI) && STI->getFeatureBits().test(FeatureTgSplit)) MCKernelDescriptor::bits_set( diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 1b94d6c43392d..24bf1ad8f55ab 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -1509,17 +1509,18 @@ multiclass MIMG_Gather : MIMG_Gather; -class MIMG_IntersectRay_Helper { - int num_addrs = !if(Is64, !if(IsA16, 9, 12), !if(IsA16, 8, 11)); +class MIMG_IntersectRay_Helper { + int num_addrs = !if(isBVH8, 11, !if(Is64, !if(IsA16, 9, 12), !if(IsA16, 8, 11))); RegisterClass RegClass = MIMGAddrSize.RegClass; int VAddrDwords = !srl(RegClass.Size, 5); int GFX11PlusNSAAddrs = !if(IsA16, 4, 5); RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32); list GFX11PlusAddrTypes = - !if(IsA16, - [node_ptr_type, VGPR_32, VReg_96, VReg_96], - [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]); + !cond(isBVH8 : [node_ptr_type, VReg_64, VReg_96, VReg_96, VGPR_32], + isDual : [node_ptr_type, VReg_64, VReg_96, VReg_96, VReg_64], + IsA16 : [node_ptr_type, VGPR_32, VReg_96, VReg_96], 
+ true : [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]); } class MIMG_IntersectRay_gfx10 @@ -1553,15 +1554,28 @@ class MIMG_IntersectRay_nsa_gfx11 addr_types> - : VIMAGE_gfx12 { let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$rsrc, A16:$a16)); let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $rsrc$a16"; -} - -multiclass MIMG_IntersectRay { - defvar info = MIMG_IntersectRay_Helper; + let Constraints = !if(!or(isDual, isBVH8), + "$ray_origin_out = $vaddr2, $ray_dir_out = $vaddr3", ""); + let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$rsrc), + !if(!or(isDual, isBVH8), (ins), (ins A16:$a16))); + let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $rsrc"# + !if(!or(isDual, isBVH8), "", "$a16"); + let SchedRW = !if(!or(isDual, isBVH8), + [WriteVMEM, WriteVMEM, WriteVMEM], [WriteVMEM]); +} + +multiclass MIMG_IntersectRay { + defvar info = MIMG_IntersectRay_Helper; def "" : MIMGBaseOpcode { let BVH = 1; let A16 = IsA16; @@ -1599,7 +1613,9 @@ multiclass MIMG_IntersectRay { } } def _gfx12 : VIMAGE_IntersectRay_gfx12 { + let VDataDwords = !if(!or(isDual, isBVH8), 10, 4); let VAddrDwords = info.num_addrs; } } @@ -1771,15 +1787,20 @@ defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler , "ima let OtherPredicates = [HasImageInsts, HasGFX10_AEncoding] in { defm IMAGE_MSAA_LOAD : MIMG_MSAA_Load , "image_msaa_load">; -defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 0>; -defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 1>; -defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 0>; -defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 1>; +defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 0, 0>; +defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 1, 0>; +defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 0, 0>; +defm 
IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 1, 0>; } // End OtherPredicates = [HasImageInsts, HasGFX10_AEncoding] +defm IMAGE_BVH_DUAL_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh_dual_intersect_ray", 1, 0, 1>; +defm IMAGE_BVH8_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh8_intersect_ray", 1, 0, 0, 1>; + let SubtargetPredicate = isGFX12Plus in { def : AMDGPUMnemonicAlias<"bvh_intersect_ray", "image_bvh_intersect_ray">; def : AMDGPUMnemonicAlias<"bvh64_intersect_ray", "image_bvh64_intersect_ray">; + def : AMDGPUMnemonicAlias<"bvh_dual_intersect_ray", "image_bvh_dual_intersect_ray">; + def : AMDGPUMnemonicAlias<"bvh8_intersect_ray", "image_bvh8_intersect_ray">; } } // End let OtherPredicates = [HasImageInsts] diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index ac69bf6d038ec..a99fd25477553 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -1069,6 +1069,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { << " is being turned to v_readfirstlane_b32" << " Score: " << C.second.Score << "\n"); Register DstReg = MI->getOperand(0).getReg(); + MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass); + Register SrcReg = MI->getOperand(1).getReg(); unsigned SubReg = MI->getOperand(1).getSubReg(); const TargetRegisterClass *SrcRC = @@ -1092,7 +1094,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { Result, *MRI, MI->getOperand(1), SrcRC, TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass); Register PartialDst = - MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MBB, *Result, Result->getDebugLoc(), TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst) .addReg(PartialSrc); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 2bc19137b1ca0..cccccfdc070d3 100644 
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -346,6 +346,12 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(FoldCandidate &Fold) const { case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + // VOP3 packed instructions ignore op_sel source modifiers, we cannot encode + // two different constants. + if ((TSFlags & SIInstrFlags::VOP3) && !(TSFlags & SIInstrFlags::VOP3P) && + static_cast(Fold.ImmToFold) != + static_cast(Fold.ImmToFold >> 16)) + return false; break; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 3e95c672808d6..09448a284b7f7 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -439,6 +439,7 @@ class PrologEpilogSGPRSpillBuilder { buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR, FI, FrameReg, DwordOff); + MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass); BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) .addReg(TmpVGPR, RegState::Kill); DwordOff += 4; @@ -1159,6 +1160,7 @@ void SIFrameLowering::emitCSRSpillStores(MachineFunction &MF, const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); const MCRegisterInfo *MCRI = MF.getContext().getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch // registers. However, save all lanes of callee-saved VGPRs. 
Due to this, we @@ -1187,6 +1189,12 @@ void SIFrameLowering::emitCSRSpillStores(MachineFunction &MF, } }; + for (const Register Reg : make_first_range(WWMScratchRegs)) { + if (!MRI.isReserved(Reg)) { + MRI.addLiveIn(Reg); + MBB.addLiveIn(Reg); + } + } StoreWWMRegisters(WWMScratchRegs); if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b632c50dae0e3..2c2e76a74da2d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -869,8 +869,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMinimum3Maximum3F32()) setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal); - if (Subtarget->hasMinimum3Maximum3PKF16()) + if (Subtarget->hasMinimum3Maximum3PKF16()) { setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal); + + // If only the vector form is available, we need to widen to a vector. 
+ if (!Subtarget->hasMinimum3Maximum3F16()) + setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom); + } } setOperationAction(ISD::INTRINSIC_WO_CHAIN, @@ -911,9 +916,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal); } - if (Subtarget->hasCvtPkF16F32Inst()) { - setOperationAction(ISD::FP_ROUND, MVT::v2f16, Legal); - } + if (Subtarget->hasCvtPkF16F32Inst()) + setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom); setTargetDAGCombine({ISD::ADD, ISD::UADDO_CARRY, @@ -1380,9 +1384,15 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } - case Intrinsic::amdgcn_image_bvh_intersect_ray: { + case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: + case Intrinsic::amdgcn_image_bvh_intersect_ray: + case Intrinsic::amdgcn_image_bvh8_intersect_ray: { Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT? + Info.memVT = + MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray + ? CI.getType() + : cast(CI.getType()) + ->getElementType(0)); // XXX: what is correct VT? 
Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; Info.align.reset(); @@ -1451,7 +1461,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } - case Intrinsic::amdgcn_ds_bvh_stack_rtn: { + case Intrinsic::amdgcn_ds_bvh_stack_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: + case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: { Info.opc = ISD::INTRINSIC_W_CHAIN; const GCNTargetMachine &TM = @@ -4563,7 +4576,8 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, const TargetRegisterClass *BoolRC = TRI->getBoolRC(); Register PhiExec = MRI.createVirtualRegister(BoolRC); Register NewExec = MRI.createVirtualRegister(BoolRC); - Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register CurrentIdxReg = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); Register CondReg = MRI.createVirtualRegister(BoolRC); BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) @@ -4608,7 +4622,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, } else { // Move index from VCC into M0 if (Offset == 0) { - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0) .addReg(CurrentIdxReg, RegState::Kill); } else { BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) @@ -4722,7 +4736,7 @@ static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, if (Offset == 0) { // clang-format off - BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0) .add(*Idx); // clang-format on } else { @@ -4990,7 +5004,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); Register FF1Reg = MRI.createVirtualRegister(DstRegClass); - Register 
LaneValueReg = MRI.createVirtualRegister(DstRegClass); + Register LaneValueReg = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); bool IsWave32 = ST.isWave32(); unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; @@ -5249,18 +5264,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) { - Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0) .addReg(Src0.getReg()); Src0.setReg(RegOp0); } if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) { - Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1) .addReg(Src1.getReg()); Src1.setReg(RegOp1); } - Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); if (TRI->isVectorRegister(MRI, Src2.getReg())) { BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2) .addReg(Src2.getReg()); @@ -5316,9 +5331,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; } case AMDGPU::SI_INIT_M0: { + MachineOperand &M0Init = MI.getOperand(0); BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), - TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .add(MI.getOperand(0)); + TII->get(M0Init.isReg() ? 
AMDGPU::COPY : AMDGPU::S_MOV_B32), + AMDGPU::M0) + .add(M0Init); MI.eraseFromParent(); return BB; } @@ -5963,6 +5980,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMINNUM: case ISD::FMAXNUM: return lowerFMINNUM_FMAXNUM(Op, DAG); + case ISD::FMINIMUM: + case ISD::FMAXIMUM: + return lowerFMINIMUM_FMAXIMUM(Op, DAG); case ISD::FLDEXP: case ISD::STRICT_FLDEXP: return lowerFLDEXP(Op, DAG); @@ -5984,8 +6004,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMUL: case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: - case ISD::FMINIMUM: - case ISD::FMAXIMUM: case ISD::FMINIMUMNUM: case ISD::FMAXIMUMNUM: case ISD::UADDSAT: @@ -6801,11 +6819,18 @@ SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, } SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); + EVT DstVT = Op.getValueType(); + + if (DstVT == MVT::v2f16) { + assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32"); + return SrcVT == MVT::v2f32 ? 
Op : SDValue(); + } + assert(Op.getValueType() == MVT::f16 && "Do not know how to custom lower FP_ROUND for non-f16 type"); - SDValue Src = Op.getOperand(0); - EVT SrcVT = Src.getValueType(); if (SrcVT != MVT::f64) return Op; @@ -6840,6 +6865,34 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, return Op; } +SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + if (VT.isVector()) + return splitBinaryVectorOp(Op, DAG); + + assert(!Subtarget->hasIEEEMinMax() && !Subtarget->hasMinimum3Maximum3F16() && + Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 && + "should not need to widen f16 minimum/maximum to v2f16"); + + // Widen f16 operation to v2f16 + + // fminimum f16:x, f16:y -> + // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x)) + // (v2f16 (scalar_to_vector y))), 0 + SDLoc SL(Op); + SDValue WideSrc0 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0)); + SDValue WideSrc1 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1)); + + SDValue Widened = + DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened, + DAG.getConstant(0, SL, MVT::i32)); +} + SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP; EVT VT = Op.getValueType(); @@ -8619,6 +8672,11 @@ SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, if (MaxID == 0) return DAG.getConstant(0, SL, MVT::i32); + // It's undefined behavior if a function marked with the amdgpu-no-* + // attributes uses the corresponding intrinsic. 
+ if (!Arg) + return DAG.getUNDEF(Op->getValueType(0)); + SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), Arg); @@ -9402,6 +9460,51 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op->getVTList(), Ops, VT, M->getMemOperand()); } + case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: + case Intrinsic::amdgcn_image_bvh8_intersect_ray: { + MemSDNode *M = cast(Op); + SDValue NodePtr = M->getOperand(2); + SDValue RayExtent = M->getOperand(3); + SDValue InstanceMask = M->getOperand(4); + SDValue RayOrigin = M->getOperand(5); + SDValue RayDir = M->getOperand(6); + SDValue Offsets = M->getOperand(7); + SDValue TDescr = M->getOperand(8); + + assert(NodePtr.getValueType() == MVT::i64); + assert(RayDir.getValueType() == MVT::v3f32); + + if (!Subtarget->hasBVHDualAndBVH8Insts()) { + emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); + return SDValue(); + } + + bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray; + const unsigned NumVDataDwords = 10; + const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12; + int Opcode = AMDGPU::getMIMGOpcode( + IsBVH8 ? 
AMDGPU::IMAGE_BVH8_INTERSECT_RAY + : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY, + AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords); + assert(Opcode != -1); + + SmallVector Ops; + Ops.push_back(NodePtr); + Ops.push_back(DAG.getBuildVector( + MVT::v2i32, DL, + {DAG.getBitcast(MVT::i32, RayExtent), + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)})); + Ops.push_back(RayOrigin); + Ops.push_back(RayDir); + Ops.push_back(Offsets); + Ops.push_back(TDescr); + Ops.push_back(M->getChain()); + + auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops); + MachineMemOperand *MemRef = M->getMemOperand(); + DAG.setNodeMemRefs(NewNode, {MemRef}); + return SDValue(NewNode, 0); + } case Intrinsic::amdgcn_image_bvh_intersect_ray: { MemSDNode *M = cast(Op); SDValue NodePtr = M->getOperand(2); @@ -9963,7 +10066,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { - assert(!AMDGPU::isGFX12Plus(*Subtarget)); + if (!Subtarget->hasVMemToLDSLoad()) + return SDValue(); unsigned Opc; bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds || @@ -10070,6 +10174,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(Load, 0); } case Intrinsic::amdgcn_global_load_lds: { + if (!Subtarget->hasVMemToLDSLoad()) + return SDValue(); + unsigned Opc; unsigned Size = Op->getConstantOperandVal(4); switch (Size) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 1cd7f1b29e077..9b2c14862407a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -146,6 +146,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { /// Custom lowering for ISD::FP_ROUND for MVT::f16. 
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const; SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index de2095fa60ffd..e8bf474a0d416 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1734,7 +1734,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // LOAD_CNT is only relevant to vgpr or LDS. unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - bool FoundAliasingStore = false; // Only objects with alias scope info were added to LDSDMAScopes array. // In the absense of the scope info we will not be able to disambiguate // aliasing here. 
There is no need to try searching for a corresponding @@ -1744,14 +1743,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) { const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { - if (MI.mayAlias(AA, *LDSDMAStores[I], true)) { - FoundAliasingStore = true; + if (MI.mayAlias(AA, *LDSDMAStores[I], true)) ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait); - } } - } - if (!FoundAliasingStore) + } else { ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + } if (Memop->isStore()) { ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 4d90e7fb40ee3..37847ffb6da0e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2239,6 +2239,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::SI_RESTORE_S32_FROM_VGPR: MI.setDesc(get(AMDGPU::V_READLANE_B32)); + MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(), + &AMDGPU::SReg_32_XM0RegClass); break; case AMDGPU::V_MOV_B64_PSEUDO: { @@ -2418,11 +2420,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { assert(ST.useVGPRIndexMode()); Register VecReg = MI.getOperand(0).getReg(); bool IsUndef = MI.getOperand(1).isUndef(); - Register Idx = MI.getOperand(3).getReg(); + MachineOperand &Idx = MI.getOperand(3); Register SubReg = MI.getOperand(4).getImm(); MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) - .addReg(Idx) + .add(Idx) .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); SetOn->getOperand(3).setIsUndef(); @@ -6527,7 +6529,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, Register VScalarOp = ScalarOp->getReg(); if (NumSubRegs == 1) { - Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register CurReg = 
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg) .addReg(VScalarOp); @@ -6559,8 +6561,10 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, "Unhandled register size"); for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { - Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register CurRegLo = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register CurRegHi = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); // Read the next variant <- also loop target. BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo) @@ -7667,9 +7671,20 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, if (Inst.isCopy() && DstReg.isPhysical() && RI.isVGPR(MRI, Inst.getOperand(1).getReg())) { // TODO: Only works for 32 bit registers. - BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), - get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg()) - .add(Inst.getOperand(1)); + if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) { + BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), + get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .add(Inst.getOperand(1)); + } else { + Register NewDst = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), + get(AMDGPU::V_READFIRSTLANE_B32), NewDst) + .add(Inst.getOperand(1)); + BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), + DstReg) + .addReg(NewDst); + } Inst.eraseFromParent(); return; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 846fee0d99e73..50f6c864492aa 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -993,6 +993,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isIGLP(const MachineInstr &MI) const { return 
isIGLP(MI.getOpcode()); } + // Return true if the instruction is mutually exclusive with all non-IGLP DAG + // mutations, requiring all other mutations to be disabled. + bool isIGLPMutationOnly(unsigned Opcode) const { + return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT; + } + static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) { switch (Opcode) { case AMDGPU::S_WAITCNT_soft: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index bb78e77a9dc1a..0d89b1270465c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1803,19 +1803,16 @@ class getVOP3SrcForVT { 1 : VSrc_b32); } -// Returns the vreg register class to use for sources of VOP3 instructions for the -// given VT. -class getVOP3VRegSrcForVT { - RegisterOperand ret = - !cond(!eq(VT.Size, 128) : RegisterOperand, - !eq(VT.Size, 96) : RegisterOperand, - !eq(VT.Size, 64) : RegisterOperand, - !eq(VT.Size, 48) : RegisterOperand, - !eq(VT.Size, 16) : !if(IsTrue16, - !if(IsFake16, RegisterOperand, - RegisterOperand), - RegisterOperand), - 1 : RegisterOperand); +// VGPR only VOP3 src with 9 bit encoding +class getVOP3VRegSrcForVT { + RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024, + !eq(VT.Size, 512) : VRegSrc_512, + !eq(VT.Size, 256) : VRegSrc_256, + !eq(VT.Size, 192) : VRegSrc_192, + !eq(VT.Size, 128) : VRegSrc_128, + !eq(VT.Size, 96) : VRegSrc_96, + !eq(VT.Size, 64) : VRegSrc_64, + 1 : VRegSrc_32); } // Src2 of VOP3 DPP instructions cannot be a literal @@ -2852,6 +2849,7 @@ def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>; def VOP_V2I16_V2F16_F32 : VOPProfile<[v2i16, v2f16, f32, untyped]>; def VOP_V2I16_V2BF16_F32 : VOPProfile<[v2i16, v2bf16, f32, untyped]>; def VOP_I32_F32_F32_F32 : VOPProfile<[i32, f32, f32, f32]>; +def VOP_I32_V2F32_I32_F32 : VOPProfile<[i32, v2f32, i32, f32]>; def VOP_I32_V2F16_F32_F32 : VOPProfile<[i32, v2f16, f32, f32]>; def VOP_I32_V2BF16_F32_F32: 
VOPProfile<[i32, v2bf16, f32, f32]>; def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index b4bf72dbbd28c..ea3f1374f8ca1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1839,7 +1839,14 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; - +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; def : BitConvert ; @@ -4182,9 +4189,25 @@ def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction { let mayStore = 1; } -def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction { +def G_AMDGPU_BVH_INTERSECT_RAY : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); - let InOperandList = (ins unknown:$intrin, variable_ops); + let InOperandList = (ins unknown:$opcode, variable_ops); + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 0; +} + +def G_AMDGPU_BVH_DUAL_INTERSECT_RAY : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst, type1:$ray_origin, type1:$ray_dir); + let InOperandList = (ins unknown:$opcode, variable_ops); + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 0; +} + +def G_AMDGPU_BVH8_INTERSECT_RAY : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst, type1:$ray_origin, type1:$ray_dir); + let InOperandList = (ins unknown:$opcode, variable_ops); let hasSideEffects = 0; let mayLoad = 1; let mayStore = 0; diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 212edff097837..c67652cfaf728 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -37,6 +37,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) { IEEEMode = 0; WgpMode = 0; MemOrdered = 0; + FwdProgress = 0; RrWgMode = 0; ScratchSize 
= ZeroExpr; @@ -90,6 +91,10 @@ static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo, if (ST.hasIEEEMode()) Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + // TODO: in the long run we will want to enable this unconditionally. + if (ST.getTargetTriple().getOS() == Triple::OSType::AMDHSA) + Reg |= S_00B848_FWD_PROGRESS(ProgInfo.FwdProgress); + if (ST.hasRrWGMode()) Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode); diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index c358a2d9db10b..6f74615ab8435 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -38,9 +38,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { uint32_t DX10Clamp = 0; uint32_t DebugMode = 0; uint32_t IEEEMode = 0; - uint32_t WgpMode = 0; // GFX10+ - uint32_t MemOrdered = 0; // GFX10+ - uint32_t RrWgMode = 0; // GFX12+ + uint32_t WgpMode = 0; // GFX10+ + uint32_t MemOrdered = 0; // GFX10+ + uint32_t FwdProgress = 0; // GFX10+ + uint32_t RrWgMode = 0; // GFX12+ const MCExpr *ScratchSize = nullptr; // State used to calculate fields set in PGM_RSRC2 pm4 packet. diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index d716edbb8ff2d..fd27ae399dafc 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2296,6 +2296,8 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, // Don't need to write VGPR out. } + MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); + // Restore clobbered registers in the specified restore block. MI = RestoreMBB.end(); SB.setMI(&RestoreMBB, MI); @@ -2310,6 +2312,7 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, SB.NumSubRegs == 1 ? 
SB.SuperReg : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); + MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass); bool LastSubReg = (i + 1 == e); auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg) @@ -3149,10 +3152,15 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (IsSALU && !LiveSCC) Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead. if (IsSALU && LiveSCC) { - Register NewDest = - IsCopy ? ResultReg - : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass, - Shift, false, 0); + Register NewDest; + if (IsCopy) { + MF->getRegInfo().constrainRegClass(ResultReg, + &AMDGPU::SReg_32_XM0RegClass); + NewDest = ResultReg; + } else { + NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, + Shift, false, 0); + } BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest) .addReg(TmpResultReg); ResultReg = NewDest; @@ -3275,10 +3283,16 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addReg(TmpResultReg); } - Register NewDest = IsCopy ? ResultReg - : RS->scavengeRegisterBackwards( - AMDGPU::SReg_32RegClass, *Add, - false, 0, /*AllowSpill=*/true); + Register NewDest; + if (IsCopy) { + MF->getRegInfo().constrainRegClass(ResultReg, + &AMDGPU::SReg_32_XM0RegClass); + NewDest = ResultReg; + } else { + NewDest = RS->scavengeRegisterBackwards( + AMDGPU::SReg_32_XM0RegClass, *Add, false, 0, + /*AllowSpill=*/true); + } BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest) .addReg(TmpResultReg); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b4cdc83e072b6..d369a9fe2b11a 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -134,6 +134,12 @@ unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) { return VersionMajor >= 12 ? 
8 : 0; } +/// \returns VaSdst bit width +inline unsigned getVaSdstBitWidth() { return 3; } + +/// \returns VaSdst bit shift +inline unsigned getVaSdstBitShift() { return 9; } + /// \returns VmVsrc bit width inline unsigned getVmVsrcBitWidth() { return 3; } @@ -146,6 +152,12 @@ inline unsigned getVaVdstBitWidth() { return 4; } /// \returns VaVdst bit shift inline unsigned getVaVdstBitShift() { return 12; } +/// \returns VaVcc bit width +inline unsigned getVaVccBitWidth() { return 1; } + +/// \returns VaVcc bit shift +inline unsigned getVaVccBitShift() { return 1; } + /// \returns SaSdst bit width inline unsigned getSaSdstBitWidth() { return 1; } @@ -1306,7 +1318,7 @@ void initDefaultAMDKernelCodeT(AMDGPUMCKernelCodeT &KernelCode, if (Version.Major >= 10) { KernelCode.compute_pgm_resource_registers |= S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) | - S_00B848_MEM_ORDERED(1); + S_00B848_MEM_ORDERED(1) | S_00B848_FWD_PROGRESS(1); } } @@ -1719,6 +1731,14 @@ unsigned decodeFieldSaSdst(unsigned Encoded) { return unpackBits(Encoded, getSaSdstBitShift(), getSaSdstBitWidth()); } +unsigned decodeFieldVaSdst(unsigned Encoded) { + return unpackBits(Encoded, getVaSdstBitShift(), getVaSdstBitWidth()); +} + +unsigned decodeFieldVaVcc(unsigned Encoded) { + return unpackBits(Encoded, getVaVccBitShift(), getVaVccBitWidth()); +} + unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) { return packBits(VmVsrc, Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth()); } @@ -1743,6 +1763,22 @@ unsigned encodeFieldSaSdst(unsigned SaSdst) { return encodeFieldSaSdst(0xffff, SaSdst); } +unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst) { + return packBits(VaSdst, Encoded, getVaSdstBitShift(), getVaSdstBitWidth()); +} + +unsigned encodeFieldVaSdst(unsigned VaSdst) { + return encodeFieldVaSdst(0xffff, VaSdst); +} + +unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc) { + return packBits(VaVcc, Encoded, getVaVccBitShift(), getVaVccBitWidth()); +} + 
+unsigned encodeFieldVaVcc(unsigned VaVcc) { + return encodeFieldVaVcc(0xffff, VaVcc); +} + } // namespace DepCtr //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 3c9246d5e107d..fad7e67ff3c76 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1168,6 +1168,12 @@ unsigned decodeFieldVmVsrc(unsigned Encoded); /// \returns Decoded SaSdst from given immediate \p Encoded. unsigned decodeFieldSaSdst(unsigned Encoded); +/// \returns Decoded VaSdst from given immediate \p Encoded. +unsigned decodeFieldVaSdst(unsigned Encoded); + +/// \returns Decoded VaVcc from given immediate \p Encoded. +unsigned decodeFieldVaVcc(unsigned Encoded); + /// \returns \p VmVsrc as an encoded Depctr immediate. unsigned encodeFieldVmVsrc(unsigned VmVsrc); @@ -1186,6 +1192,18 @@ unsigned encodeFieldSaSdst(unsigned SaSdst); /// \returns \p Encoded combined with encoded \p SaSdst. unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst); +/// \returns \p VaSdst as an encoded Depctr immediate. +unsigned encodeFieldVaSdst(unsigned VaSdst); + +/// \returns \p Encoded combined with encoded \p VaSdst. +unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst); + +/// \returns \p VaVcc as an encoded Depctr immediate. +unsigned encodeFieldVaVcc(unsigned VaVcc); + +/// \returns \p Encoded combined with encoded \p VaVcc. 
+unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc); + } // namespace DepCtr namespace Exp { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index b9c73e6ce8ef2..2afc7bc5ea3b6 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -243,7 +243,7 @@ defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>; } // End isMoveImm = 1 def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> { - let DstRC = RegisterOperand; + let DstRC = RegisterOperand; let Src0RC32 = VRegOrLdsSrc_32; let Asm32 = " $vdst, $src0"; } @@ -317,7 +317,7 @@ defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>; let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; -defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>; +defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32, int_amdgcn_cvt_off_f32_i4>; } // End ReadsModeReg = 0, mayRaiseFPException = 0 } // End SchedRW = [WriteFloatCvt] @@ -391,8 +391,8 @@ def VOP_PERMLANE_SWAP : VOPProfile<[i32, i32, untyped, untyped]> { let HasExtDPP = 0; let HasExtSDWA = 0; - let Ins32 = (ins Src0RC64:$vdst_in, Src0RC32:$src0); - let Ins64 = (ins Src0RC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl); + let Ins32 = (ins DstRC:$vdst_in, Src0RC32:$src0); + let Ins64 = (ins DstRC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl); let InsVOP3OpSel = (ins Src0RC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl); let Asm64 = "$vdst, $src0$bound_ctrl$fi"; let AsmVOP3OpSel = "$vdst, $src0$bound_ctrl$fi"; @@ -787,7 +787,8 @@ defm V_PRNG_B32 : VOP1Inst <"v_prng_b32", VOP_I32_I32, int_amdgcn_prng_b32>; let Constraints = "$vdst = $vdst_in, $src0_out = $src0", 
DisableEncoding="$vdst_in,$src0_out", - SchedRW = [Write32Bit, Write32Bit] in { + SchedRW = [Write32Bit, Write32Bit], + isConvergent = 1 in { let SubtargetPredicate = HasPermlane16Swap in { defm V_PERMLANE16_SWAP_B32 : VOP1Inst<"v_permlane16_swap_b32", VOP_PERMLANE_SWAP>; } @@ -1569,8 +1570,11 @@ defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>; defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>; defm V_PRNG_B32 : VOP1_Real_gfx9 <0x58>; + +let isConvergent = 1 in { defm V_PERMLANE16_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x059>; defm V_PERMLANE32_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x05a>; +} class MovDPP8Pattern : GCNPat < (vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)), diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 900c91731aa1b..1bac8656192a7 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -418,12 +418,27 @@ def VOP_MADMK_F16_fake16 : VOP_MADMK { } def VOP_MADMK_F32 : VOP_MADMK ; +// Returns the vreg register class to use for sources of VOP3 instructions for the +// given VT. +class getVOP3VRegForVT { + RegisterOperand ret = + !cond(!eq(VT.Size, 128) : RegisterOperand, + !eq(VT.Size, 96) : RegisterOperand, + !eq(VT.Size, 64) : RegisterOperand, + !eq(VT.Size, 48) : RegisterOperand, + !eq(VT.Size, 16) : !if(IsTrue16, + !if(IsFake16, RegisterOperand, + RegisterOperand), + RegisterOperand), + 1 : RegisterOperand); +} + // FIXME: Remove src2_modifiers. It isn't used, so is wasting memory // and processing time but it makes it easier to convert to mad. 
class VOP_MAC : VOPProfile <[vt0, vt1, vt1, vt0]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT.ret:$src2); // Src2 must accept the same operand types as vdst, namely VGPRs only - let Src2RC64 = getVOP3VRegSrcForVT.ret; + let Src2RC64 = getVOP3VRegForVT.ret; let Ins64 = getIns64.ret; @@ -748,7 +763,7 @@ def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> { } def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> { - let Outs32 = (outs SReg_32:$vdst); + let Outs32 = (outs SReg_32_XM0:$vdst); let Outs64 = Outs32; let Ins32 = (ins VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1); let Ins64 = Ins32; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index c06c932a5375e..aa4399c9cdf76 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -861,10 +861,12 @@ class SrcAndDstSelToOpSelXForm : SDNodeXFormgetZExtValue(); unsigned New = 0; if (}] # modifier_idx # [{ == 0) { - New = (}] # dest_sel # [{ == 1) ? ((Val & 0x2) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL) - : ((Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE); - } else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) { - New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL) + : ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE); + } else if (}] # modifier_idx # [{== 1) { + New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + } if (}] # modifier_idx # [{== 2) { + New = (Val & 0x1) ? 
SISrcMods::OP_SEL_0 : SISrcMods::NONE; } return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32); }]>; @@ -1027,7 +1029,11 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile : let HasFP4DstByteSel = 1; } -def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile, VOP3_OPSEL> { +class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile + : VOP3_Profile { + + let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VRegSrcForVT.ret, + getVOP3SrcForVT.ret); let InsVOP3OpSel = (ins PackedF32InputMods: $src0_modifiers, Src0RC64:$src0, Int32InputMods: $src1_modifiers, Src1RC64:$src1, FP32InputMods: $src2_modifiers, Src2RC64:$src2, @@ -1075,6 +1081,11 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile : VOP3_Profile

{ let HasExt32BitDPP = 0; let HasExtVOP3DPP = 0; let HasExt64BitDPP = 0; + + // All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for + // any operand slots > 32 bit. + let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VRegSrcForVT.ret, + getVOP3SrcForVT.ret); } let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in { @@ -1116,7 +1127,10 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in let Constraints = "@earlyclobber $vdst" in { defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile>; defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile>; - defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>; + defm V_CVT_SCALEF32_SR_PK_FP4_F32 + : VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", + VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile< + VOP_I32_V2F32_I32_F32>>; } } defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 689f3b38a1723..415a322a122ab 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -785,12 +785,12 @@ class MFMA_F8F6F4_WithSizeTable_Helper : // Currently assumes scaled instructions never have abid class MAIFrag : PatFrag < !if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp, - node:$scale_src0_opsel, node:$scale_src0, - node:$scale_src1_opsel, node:$scale_src1), + node:$src0_modifiers, node:$scale_src0, + node:$src1_modifiers, node:$scale_src1), !con((ops node:$src0, node:$src1, node:$src2, node:$cbsz), !if(HasAbid, (ops node:$abid), (ops)), (ops node:$blgp))), - !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, 
$scale_src0_opsel, $scale_src0, $scale_src1_opsel, $scale_src1), + !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1), !if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp), (Op $src0, $src1, $src2, $cbsz, $blgp))), pred @@ -848,15 +848,20 @@ class MAIInst : MAIInst { // Append operands from V_MFMA_LD_SCALE_B32, but we need to rename them. + // Restrict to VGPR only (VRegSrc_32) for the scale operands to workaround a + // hardware design defect: For all Inline/SGPR constants, SP HW use bits + // [30:23] as the scale. + // TODO: We may still be able to allow Inline Constants/SGPR, with a proper + // shift, to obtain a potentially better performance. let InOperandList = !con(BaseInst.InOperandList, - (ins VSrc_b32:$scale_src0, - VSrc_b32:$scale_src1, - op_sel0:$scale_src0_opsel, - op_sel_hi0:$scale_src1_opsel)); + (ins VRegSrc_32:$scale_src0, + VRegSrc_32:$scale_src1, + op_sel0:$src0_modifiers, + op_sel_hi0:$src1_modifiers)); let AsmOperands = "$vdst, $src0, $src1, $src2, $scale_src0, $scale_src1" - "$scale_src0_opsel$scale_src1_opsel$cbsz$blgp"; - + "$src0_modifiers$src1_modifiers$cbsz$blgp"; + let AsmMatchConverter = "cvtScaledMFMA"; let FixedSize = 1; let Size = 16; } @@ -1997,7 +2002,6 @@ multiclass VOP3PX_Real_ScaledMFMA op> { defvar PS_VCD = !cast(NAME # "_vgprcd" # "_e64"); defvar Name = PS_ACD.Mnemonic; defvar F8F8Name = !substr(NAME, 0, !sub(!size(NAME), !size("_fN_fM")))#"_f8_f8"; - let SubtargetPredicate = HasGFX950Insts, DecoderNamespace = "GFX940", AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in { diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 39f6fd25ddb33..188cac4a1527e 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -15,6 +15,7 @@ class LetDummies { bit isConvertibleToThreeAddress; bit isMoveImm; bit isReMaterializable; + bit isConvergent; bit isAsCheapAsAMove; 
bit FPDPRounding; Predicate SubtargetPredicate; @@ -528,14 +529,16 @@ class VOP3PXe op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_ bits<9> scale_src0; bits<9> scale_src1; - bits<2> scale_src0_opsel; - bits<2> scale_src1_opsel; + //MFMALdScaleModifierOp transforms 2 bit opsel input to 4 bit value + //where opsel and opselHi are in 3rd and 4th bit. + bits<4> src0_modifiers; + bits<4> src1_modifiers; // Inst{7-0} = unused // Inst{10-8} = neg_hi; // Inst{13-11} = op_sel - let Inst{11} = scale_src0_opsel{0}; - let Inst{12} = scale_src1_opsel{0}; + let Inst{11} = src0_modifiers{2}; //opsel[0] + let Inst{12} = src1_modifiers{2}; //opsel[1] // Inst{13} = unused op_sel // Inst{14} = unused op_sel_hi2 @@ -544,8 +547,8 @@ class VOP3PXe op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_ let Inst{49-41} = scale_src1; // Inst{50-58} = unused // Inst{60-59} = op_sel_hi; - let Inst{59} = scale_src0_opsel{1}; - let Inst{60} = scale_src1_opsel{1}; + let Inst{59} = src0_modifiers{3}; //opsel_hi[0] + let Inst{60} = src1_modifiers{3}; //opsel_hi[1] // Inst{63-61} = neg; // The high half of the encoding is the unscaled mfma op. 
@@ -1433,17 +1436,17 @@ class getVOP3MAIScaledPat { // mfma [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, timm:$cbsz, timm:$blgp, - MFMALdScaleModifierOp:$scale_src0_opsel, + MFMALdScaleModifierOp:$src0_modifiers, i32:$scale_src0, - MFMALdScaleModifierOp:$scale_src1_opsel, + MFMALdScaleModifierOp:$src1_modifiers, i32:$scale_src1 ))], // smfmac [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx, timm:$cbsz, timm:$abid, - MFMALdScaleModifierOp:$scale_src0_opsel, + MFMALdScaleModifierOp:$src0_modifiers, i32:$scale_src0, - MFMALdScaleModifierOp:$scale_src1_opsel, + MFMALdScaleModifierOp:$src1_modifiers, i32:$scale_src1))]); } diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 78db8413e62c9..c202f7fa93db6 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -284,7 +284,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, // Adjust stack pointer. int StackAdj = StackAdjust.getImm(); int MaxTCDelta = X86FI->getTCReturnAddrDelta(); - int Offset = 0; + int64_t Offset = 0; assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive"); // Incoporate the retaddr area. @@ -297,7 +297,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, if (Offset) { // Check for possible merge with preceding ADD instruction. 
- Offset += X86FL->mergeSPUpdates(MBB, MBBI, true); + Offset = X86FL->mergeSPAdd(MBB, MBBI, Offset, true); X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true); } diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index a7b60afb7f547..350218dcdc815 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -223,6 +223,8 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { return false; } +constexpr int64_t MaxSPChunk = (1LL << 31) - 1; + /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value. void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, @@ -242,7 +244,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, return; } - uint64_t Chunk = (1LL << 31) - 1; + uint64_t Chunk = MaxSPChunk; MachineFunction &MF = *MBB.getParent(); const X86Subtarget &STI = MF.getSubtarget(); @@ -391,12 +393,15 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( return MI; } -int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - bool doMergeWithPrevious) const { +template +int64_t X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + FoundT FoundStackAdjust, + CalcT CalcNewOffset, + bool doMergeWithPrevious) const { if ((doMergeWithPrevious && MBBI == MBB.begin()) || (!doMergeWithPrevious && MBBI == MBB.end())) - return 0; + return CalcNewOffset(0); MachineBasicBlock::iterator PI = doMergeWithPrevious ? 
std::prev(MBBI) : MBBI; @@ -415,27 +420,38 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction()) PI = std::prev(PI); - unsigned Opc = PI->getOpcode(); - int Offset = 0; - - if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) && - PI->getOperand(0).getReg() == StackPtr) { - assert(PI->getOperand(1).getReg() == StackPtr); - Offset = PI->getOperand(2).getImm(); - } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) && - PI->getOperand(0).getReg() == StackPtr && - PI->getOperand(1).getReg() == StackPtr && - PI->getOperand(2).getImm() == 1 && - PI->getOperand(3).getReg() == X86::NoRegister && - PI->getOperand(5).getReg() == X86::NoRegister) { - // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg. - Offset = PI->getOperand(4).getImm(); - } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) && - PI->getOperand(0).getReg() == StackPtr) { - assert(PI->getOperand(1).getReg() == StackPtr); - Offset = -PI->getOperand(2).getImm(); - } else - return 0; + int64_t Offset = 0; + for (;;) { + unsigned Opc = PI->getOpcode(); + + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) && + PI->getOperand(0).getReg() == StackPtr) { + assert(PI->getOperand(1).getReg() == StackPtr); + Offset = PI->getOperand(2).getImm(); + } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) && + PI->getOperand(0).getReg() == StackPtr && + PI->getOperand(1).getReg() == StackPtr && + PI->getOperand(2).getImm() == 1 && + PI->getOperand(3).getReg() == X86::NoRegister && + PI->getOperand(5).getReg() == X86::NoRegister) { + // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg. 
+ Offset = PI->getOperand(4).getImm(); + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) && + PI->getOperand(0).getReg() == StackPtr) { + assert(PI->getOperand(1).getReg() == StackPtr); + Offset = -PI->getOperand(2).getImm(); + } else + return CalcNewOffset(0); + + FoundStackAdjust(PI, Offset); + if (std::abs((int64_t)CalcNewOffset(Offset)) < MaxSPChunk) + break; + + if (doMergeWithPrevious ? (PI == MBB.begin()) : (PI == MBB.end())) + return CalcNewOffset(0); + + PI = doMergeWithPrevious ? std::prev(PI) : std::next(PI); + } PI = MBB.erase(PI); if (PI != MBB.end() && PI->isCFIInstruction()) { @@ -448,7 +464,16 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, if (!doMergeWithPrevious) MBBI = skipDebugInstructionsForward(PI, MBB.end()); - return Offset; + return CalcNewOffset(Offset); +} + +int64_t X86FrameLowering::mergeSPAdd(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + int64_t AddOffset, + bool doMergeWithPrevious) const { + return mergeSPUpdates( + MBB, MBBI, [AddOffset](int64_t Offset) { return AddOffset + Offset; }, + doMergeWithPrevious); } void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, @@ -1975,8 +2000,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // If there is an SUB32ri of ESP immediately before this instruction, merge // the two. This can be the case when tail call elimination is enabled and - // the callee has more arguments then the caller. - NumBytes -= mergeSPUpdates(MBB, MBBI, true); + // the callee has more arguments than the caller. + NumBytes = mergeSPUpdates( + MBB, MBBI, [NumBytes](int64_t Offset) { return NumBytes - Offset; }, + true); // Adjust stack pointer: ESP -= numbytes. @@ -2457,7 +2484,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (HasFP) { if (X86FI->hasSwiftAsyncContext()) { // Discard the context. 
- int Offset = 16 + mergeSPUpdates(MBB, MBBI, true); + int64_t Offset = mergeSPAdd(MBB, MBBI, 16, true); emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true); } // Pop EBP. @@ -2531,7 +2558,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // If there is an ADD32ri or SUB32ri of ESP immediately before this // instruction, merge the two instructions. if (NumBytes || MFI.hasVarSizedObjects()) - NumBytes += mergeSPUpdates(MBB, MBBI, true); + NumBytes = mergeSPAdd(MBB, MBBI, NumBytes, true); // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was @@ -2618,11 +2645,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) { // Add the return addr area delta back since we are not tail calling. - int Offset = -1 * X86FI->getTCReturnAddrDelta(); + int64_t Offset = -1 * X86FI->getTCReturnAddrDelta(); assert(Offset >= 0 && "TCDelta should never be positive"); if (Offset) { // Check for possible merge with preceding ADD instruction. - Offset += mergeSPUpdates(MBB, Terminator, true); + Offset = mergeSPAdd(MBB, Terminator, Offset, true); emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true); } } @@ -3820,13 +3847,24 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr( // Add Amount to SP to destroy a frame, or subtract to setup. int64_t StackAdjustment = isDestroy ? Amount : -Amount; + int64_t CfaAdjustment = StackAdjustment; if (StackAdjustment) { // Merge with any previous or following adjustment instruction. Note: the // instructions merged with here do not have CFI, so their stack - // adjustments do not feed into CfaAdjustment. 
- StackAdjustment += mergeSPUpdates(MBB, InsertPos, true); - StackAdjustment += mergeSPUpdates(MBB, InsertPos, false); + // adjustments do not feed into CfaAdjustment + + auto CalcCfaAdjust = [&CfaAdjustment](MachineBasicBlock::iterator PI, + int64_t Offset) { + CfaAdjustment += Offset; + }; + auto CalcNewOffset = [&StackAdjustment](int64_t Offset) { + return StackAdjustment + Offset; + }; + StackAdjustment = + mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, true); + StackAdjustment = + mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, false); if (StackAdjustment) { if (!(F.hasMinSize() && @@ -3836,7 +3874,7 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr( } } - if (DwarfCFI && !hasFP(MF)) { + if (DwarfCFI && !hasFP(MF) && CfaAdjustment) { // If we don't have FP, but need to generate unwind information, // we need to set the correct CFA offset after the stack adjustment. // How much we adjust the CFA offset depends on whether we're emitting @@ -3844,14 +3882,11 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr( // offset to be correct at each call site, while for debugging we want // it to be more precise. - int64_t CfaAdjustment = -StackAdjustment; // TODO: When not using precise CFA, we also need to adjust for the // InternalAmt here. 
- if (CfaAdjustment) { - BuildCFI( - MBB, InsertPos, DL, - MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment)); - } + BuildCFI( + MBB, InsertPos, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -CfaAdjustment)); } return I; diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index 02fe8ee02a7e4..ef41b4653becc 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -134,12 +134,50 @@ class X86FrameLowering : public TargetFrameLowering { processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, RegScavenger *RS) const override; - /// Check the instruction before/after the passed instruction. If - /// it is an ADD/SUB/LEA instruction it is deleted argument and the - /// stack adjustment is returned as a positive value for ADD/LEA and - /// a negative for SUB. - int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - bool doMergeWithPrevious) const; +private: + /// Basic Pseudocode: + /// if (instruction before/after the passed instruction is ADD/SUB/LEA) + /// Offset = instruction stack adjustment + /// ... positive value for ADD/LEA and negative for SUB + /// FoundStackAdjust(instruction, Offset) + /// erase(instruction) + /// return CalcNewOffset(Offset) + /// else + /// return CalcNewOffset(0) + /// + /// It's possible that the selected instruction is not immediately + /// before/after MBBI for large adjustments that have been split into multiple + /// instructions. 
+ /// + /// FoundStackAdjust should have the signature: + /// void FoundStackAdjust(MachineBasicBlock::iterator PI, int64_t Offset) + /// CalcNewOffset should have the signature: + /// int64_t CalcNewOffset(int64_t Offset) + template + int64_t mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + FoundT FoundStackAdjust, CalcT CalcNewOffset, + bool doMergeWithPrevious) const; + + template + int64_t mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, CalcT CalcNewOffset, + bool doMergeWithPrevious) const { + auto FoundStackAdjust = [](MachineBasicBlock::iterator MBBI, + int64_t Offset) {}; + return mergeSPUpdates(MBB, MBBI, FoundStackAdjust, CalcNewOffset, + doMergeWithPrevious); + } + +public: + /// Equivalent to: + /// mergeSPUpdates(MBB, MBBI, + /// [AddOffset](int64_t Offset) { + /// return AddOffset + Offset; + /// }, + /// doMergeWithPrevious); + int64_t mergeSPAdd(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + int64_t AddOffset, bool doMergeWithPrevious) const; /// Emit a series of instructions to increment / decrement the stack /// pointer by a constant value. 
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 0a605dfd017cb..1ca1079f806fc 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -378,6 +378,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["prng-inst"] = true; Features["wavefrontsize32"] = true; Features["wavefrontsize64"] = true; + Features["vmem-to-lds-load-insts"] = true; } else if (T.isAMDGCN()) { AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU); switch (Kind) { @@ -463,6 +464,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["s-memrealtime"] = true; Features["s-memtime-inst"] = true; Features["gws"] = true; + Features["vmem-to-lds-load-insts"] = true; break; case GK_GFX1012: case GK_GFX1011: @@ -487,6 +489,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["s-memrealtime"] = true; Features["s-memtime-inst"] = true; Features["gws"] = true; + Features["vmem-to-lds-load-insts"] = true; break; case GK_GFX950: Features["bitop3-insts"] = true; @@ -539,6 +542,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["ci-insts"] = true; Features["s-memtime-inst"] = true; Features["gws"] = true; + Features["vmem-to-lds-load-insts"] = true; break; case GK_GFX90A: Features["gfx90a-insts"] = true; @@ -591,6 +595,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["image-insts"] = true; Features["s-memtime-inst"] = true; Features["gws"] = true; + Features["vmem-to-lds-load-insts"] = true; break; case GK_NONE: break; diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp index 40164a34f08ac..257084d9c9797 100644 --- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp +++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp @@ -37,6 +37,16 @@ // memory that ends up in one of the runtime equivalents, since this can // happen if e.g. 
a library that was compiled without interposition returns // an allocation that can be validly passed to `free`. +// +// 3. MathFixup (required): Some accelerators might have an incomplete +// implementation for the intrinsics used to implement some of the math +// functions in / their corresponding libcall lowerings. Since this +// can vary quite significantly between accelerators, we replace calls to a +// set of intrinsics / lib functions known to be problematic with calls to a +// HIPSTDPAR specific forwarding layer, which gives an uniform interface for +// accelerators to implement in their own runtime components. This pass +// should run before AcceleratorCodeSelection so as to prevent the spurious +// removal of the HIPSTDPAR specific forwarding functions. //===----------------------------------------------------------------------===// #include "llvm/Transforms/HipStdPar/HipStdPar.h" @@ -48,6 +58,7 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -231,45 +242,55 @@ PreservedAnalyses } static constexpr std::pair ReplaceMap[]{ - {"aligned_alloc", "__hipstdpar_aligned_alloc"}, - {"calloc", "__hipstdpar_calloc"}, - {"free", "__hipstdpar_free"}, - {"malloc", "__hipstdpar_malloc"}, - {"memalign", "__hipstdpar_aligned_alloc"}, - {"posix_memalign", "__hipstdpar_posix_aligned_alloc"}, - {"realloc", "__hipstdpar_realloc"}, - {"reallocarray", "__hipstdpar_realloc_array"}, - {"_ZdaPv", "__hipstdpar_operator_delete"}, - {"_ZdaPvm", "__hipstdpar_operator_delete_sized"}, - {"_ZdaPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"}, - {"_ZdaPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"}, - {"_ZdlPv", "__hipstdpar_operator_delete"}, - {"_ZdlPvm", "__hipstdpar_operator_delete_sized"}, - {"_ZdlPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"}, - 
{"_ZdlPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"}, - {"_Znam", "__hipstdpar_operator_new"}, - {"_ZnamRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"}, - {"_ZnamSt11align_val_t", "__hipstdpar_operator_new_aligned"}, - {"_ZnamSt11align_val_tRKSt9nothrow_t", - "__hipstdpar_operator_new_aligned_nothrow"}, - - {"_Znwm", "__hipstdpar_operator_new"}, - {"_ZnwmRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"}, - {"_ZnwmSt11align_val_t", "__hipstdpar_operator_new_aligned"}, - {"_ZnwmSt11align_val_tRKSt9nothrow_t", - "__hipstdpar_operator_new_aligned_nothrow"}, - {"__builtin_calloc", "__hipstdpar_calloc"}, - {"__builtin_free", "__hipstdpar_free"}, - {"__builtin_malloc", "__hipstdpar_malloc"}, - {"__builtin_operator_delete", "__hipstdpar_operator_delete"}, - {"__builtin_operator_new", "__hipstdpar_operator_new"}, - {"__builtin_realloc", "__hipstdpar_realloc"}, - {"__libc_calloc", "__hipstdpar_calloc"}, - {"__libc_free", "__hipstdpar_free"}, - {"__libc_malloc", "__hipstdpar_malloc"}, - {"__libc_memalign", "__hipstdpar_aligned_alloc"}, - {"__libc_realloc", "__hipstdpar_realloc"} -}; + {"aligned_alloc", "__hipstdpar_aligned_alloc"}, + {"calloc", "__hipstdpar_calloc"}, + {"free", "__hipstdpar_free"}, + {"malloc", "__hipstdpar_malloc"}, + {"memalign", "__hipstdpar_aligned_alloc"}, + {"mmap", "__hipstdpar_mmap"}, + {"munmap", "__hipstdpar_munmap"}, + {"posix_memalign", "__hipstdpar_posix_aligned_alloc"}, + {"realloc", "__hipstdpar_realloc"}, + {"reallocarray", "__hipstdpar_realloc_array"}, + {"_ZdaPv", "__hipstdpar_operator_delete"}, + {"_ZdaPvm", "__hipstdpar_operator_delete_sized"}, + {"_ZdaPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"}, + {"_ZdaPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"}, + {"_ZdlPv", "__hipstdpar_operator_delete"}, + {"_ZdlPvm", "__hipstdpar_operator_delete_sized"}, + {"_ZdlPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"}, + {"_ZdlPvmSt11align_val_t", 
"__hipstdpar_operator_delete_aligned_sized"}, + {"_Znam", "__hipstdpar_operator_new"}, + {"_ZnamRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"}, + {"_ZnamSt11align_val_t", "__hipstdpar_operator_new_aligned"}, + {"_ZnamSt11align_val_tRKSt9nothrow_t", + "__hipstdpar_operator_new_aligned_nothrow"}, + + {"_Znwm", "__hipstdpar_operator_new"}, + {"_ZnwmRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"}, + {"_ZnwmSt11align_val_t", "__hipstdpar_operator_new_aligned"}, + {"_ZnwmSt11align_val_tRKSt9nothrow_t", + "__hipstdpar_operator_new_aligned_nothrow"}, + {"__builtin_calloc", "__hipstdpar_calloc"}, + {"__builtin_free", "__hipstdpar_free"}, + {"__builtin_malloc", "__hipstdpar_malloc"}, + {"__builtin_operator_delete", "__hipstdpar_operator_delete"}, + {"__builtin_operator_new", "__hipstdpar_operator_new"}, + {"__builtin_realloc", "__hipstdpar_realloc"}, + {"__libc_calloc", "__hipstdpar_calloc"}, + {"__libc_free", "__hipstdpar_free"}, + {"__libc_malloc", "__hipstdpar_malloc"}, + {"__libc_memalign", "__hipstdpar_aligned_alloc"}, + {"__libc_realloc", "__hipstdpar_realloc"}}; + +static constexpr std::pair HiddenMap[]{ + // hidden_malloc and hidden_free are only kept for backwards compatibility / + // legacy purposes, and we should remove them in the future + {"__hipstdpar_hidden_malloc", "__libc_malloc"}, + {"__hipstdpar_hidden_free", "__libc_free"}, + {"__hipstdpar_hidden_memalign", "__libc_memalign"}, + {"__hipstdpar_hidden_mmap", "mmap"}, + {"__hipstdpar_hidden_munmap", "munmap"}}; PreservedAnalyses HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) { @@ -299,20 +320,121 @@ HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) { } } - if (auto F = M.getFunction("__hipstdpar_hidden_malloc")) { - auto LibcMalloc = M.getOrInsertFunction( - "__libc_malloc", F->getFunctionType(), F->getAttributes()); - F->replaceAllUsesWith(LibcMalloc.getCallee()); + for (auto &&HR : HiddenMap) { + if (auto F = M.getFunction(HR.first)) { + 
auto R = M.getOrInsertFunction(HR.second, F->getFunctionType(), + F->getAttributes()); + F->replaceAllUsesWith(R.getCallee()); - eraseFromModule(*F); + eraseFromModule(*F); + } } - if (auto F = M.getFunction("__hipstdpar_hidden_free")) { - auto LibcFree = M.getOrInsertFunction("__libc_free", F->getFunctionType(), - F->getAttributes()); - F->replaceAllUsesWith(LibcFree.getCallee()); - eraseFromModule(*F); + return PreservedAnalyses::none(); +} + +static constexpr std::pair MathLibToHipStdPar[]{ + {"acosh", "__hipstdpar_acosh_f64"}, + {"acoshf", "__hipstdpar_acosh_f32"}, + {"asinh", "__hipstdpar_asinh_f64"}, + {"asinhf", "__hipstdpar_asinh_f32"}, + {"atanh", "__hipstdpar_atanh_f64"}, + {"atanhf", "__hipstdpar_atanh_f32"}, + {"cbrt", "__hipstdpar_cbrt_f64"}, + {"cbrtf", "__hipstdpar_cbrt_f32"}, + {"erf", "__hipstdpar_erf_f64"}, + {"erff", "__hipstdpar_erf_f32"}, + {"erfc", "__hipstdpar_erfc_f64"}, + {"erfcf", "__hipstdpar_erfc_f32"}, + {"fdim", "__hipstdpar_fdim_f64"}, + {"fdimf", "__hipstdpar_fdim_f32"}, + {"expm1", "__hipstdpar_expm1_f64"}, + {"expm1f", "__hipstdpar_expm1_f32"}, + {"hypot", "__hipstdpar_hypot_f64"}, + {"hypotf", "__hipstdpar_hypot_f32"}, + {"ilogb", "__hipstdpar_ilogb_f64"}, + {"ilogbf", "__hipstdpar_ilogb_f32"}, + {"lgamma", "__hipstdpar_lgamma_f64"}, + {"lgammaf", "__hipstdpar_lgamma_f32"}, + {"log1p", "__hipstdpar_log1p_f64"}, + {"log1pf", "__hipstdpar_log1p_f32"}, + {"logb", "__hipstdpar_logb_f64"}, + {"logbf", "__hipstdpar_logb_f32"}, + {"nextafter", "__hipstdpar_nextafter_f64"}, + {"nextafterf", "__hipstdpar_nextafter_f32"}, + {"nexttoward", "__hipstdpar_nexttoward_f64"}, + {"nexttowardf", "__hipstdpar_nexttoward_f32"}, + {"remainder", "__hipstdpar_remainder_f64"}, + {"remainderf", "__hipstdpar_remainder_f32"}, + {"remquo", "__hipstdpar_remquo_f64"}, + {"remquof", "__hipstdpar_remquo_f32"}, + {"scalbln", "__hipstdpar_scalbln_f64"}, + {"scalblnf", "__hipstdpar_scalbln_f32"}, + {"scalbn", "__hipstdpar_scalbn_f64"}, + {"scalbnf", 
"__hipstdpar_scalbn_f32"}, + {"tgamma", "__hipstdpar_tgamma_f64"}, + {"tgammaf", "__hipstdpar_tgamma_f32"}}; + +PreservedAnalyses HipStdParMathFixupPass::run(Module &M, + ModuleAnalysisManager &) { + if (M.empty()) + return PreservedAnalyses::all(); + + SmallVector> ToReplace; + for (auto &&F : M) { + if (!F.hasName()) + continue; + + StringRef N = F.getName(); + Intrinsic::ID ID = F.getIntrinsicID(); + + switch (ID) { + case Intrinsic::not_intrinsic: { + auto It = + find_if(MathLibToHipStdPar, [&](auto &&M) { return M.first == N; }); + if (It == std::cend(MathLibToHipStdPar)) + continue; + ToReplace.emplace_back(&F, It->second); + break; + } + case Intrinsic::acos: + case Intrinsic::asin: + case Intrinsic::atan: + case Intrinsic::atan2: + case Intrinsic::cosh: + case Intrinsic::sinh: + case Intrinsic::tan: + case Intrinsic::tanh: + break; + default: { + if (F.getReturnType()->isDoubleTy()) { + switch (ID) { + case Intrinsic::cos: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::log: + case Intrinsic::log10: + case Intrinsic::log2: + case Intrinsic::pow: + case Intrinsic::sin: + break; + default: + continue; + } + break; + } + continue; + } + } + + ToReplace.emplace_back(&F, N); + llvm::replace(ToReplace.back().second, '.', '_'); + StringRef Prefix = "llvm"; + ToReplace.back().second.replace(0, Prefix.size(), "__hipstdpar"); } + for (auto &&[F, NewF] : ToReplace) + F->replaceAllUsesWith( + M.getOrInsertFunction(NewF, F->getFunctionType()).getCallee()); return PreservedAnalyses::none(); } diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index a93284926d684..333a7fc1a6dca 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -259,11 +259,15 @@ AA::getInitialValueForObj(Attributor &A, const AbstractAttribute &QueryingAA, if (!Initializer) return nullptr; } else { - if (!GV->hasLocalLinkage() && - (GV->isInterposable() || !(GV->isConstant() && 
GV->hasInitializer()))) - return nullptr; - if (!GV->hasInitializer()) - return UndefValue::get(&Ty); + if (!GV->hasLocalLinkage()) { + // Externally visible global that's either non-constant, + // or a constant with an uncertain initializer. + if (!GV->hasDefinitiveInitializer() || !GV->isConstant()) + return nullptr; + } + + // Globals with local linkage are always initialized. + assert(!GV->hasLocalLinkage() || GV->hasInitializer()); if (!Initializer) Initializer = GV->getInitializer(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index ca8a20b4b7312..de1007543cd50 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -11,9 +11,12 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" @@ -3546,6 +3549,154 @@ static Value *foldOrOfInversions(BinaryOperator &I, return nullptr; } +/// Match \p V as "shufflevector -> bitcast" or "extractelement -> zext -> shl" +/// patterns, which extract vector elements and pack them in the same relative +/// positions. +/// +/// \p Vec is the underlying vector being extracted from. +/// \p Mask is a bitmask identifying which packed elements are obtained from the +/// vector. +/// \p VecOffset is the vector element corresponding to index 0 of the +/// mask. 
+static bool matchSubIntegerPackFromVector(Value *V, Value *&Vec, + int64_t &VecOffset, + SmallBitVector &Mask, + const DataLayout &DL) { + static const auto m_ConstShlOrSelf = [](const auto &Base, uint64_t &ShlAmt) { + ShlAmt = 0; + return m_CombineOr(m_Shl(Base, m_ConstantInt(ShlAmt)), Base); + }; + + // First try to match extractelement -> zext -> shl + uint64_t VecIdx, ShlAmt; + if (match(V, m_ConstShlOrSelf(m_ZExtOrSelf(m_ExtractElt( + m_Value(Vec), m_ConstantInt(VecIdx))), + ShlAmt))) { + auto *VecTy = dyn_cast(Vec->getType()); + if (!VecTy) + return false; + auto *EltTy = dyn_cast(VecTy->getElementType()); + if (!EltTy) + return false; + + const unsigned EltBitWidth = EltTy->getBitWidth(); + const unsigned TargetBitWidth = V->getType()->getIntegerBitWidth(); + if (TargetBitWidth % EltBitWidth != 0 || ShlAmt % EltBitWidth != 0) + return false; + const unsigned TargetEltWidth = TargetBitWidth / EltBitWidth; + const unsigned ShlEltAmt = ShlAmt / EltBitWidth; + + const unsigned MaskIdx = + DL.isLittleEndian() ? ShlEltAmt : TargetEltWidth - ShlEltAmt - 1; + + VecOffset = static_cast(VecIdx) - static_cast(MaskIdx); + Mask.resize(TargetEltWidth); + Mask.set(MaskIdx); + return true; + } + + // Now try to match a bitcasted subvector. + Instruction *SrcVecI; + if (!match(V, m_BitCast(m_Instruction(SrcVecI)))) + return false; + + auto *SrcTy = dyn_cast(SrcVecI->getType()); + if (!SrcTy) + return false; + + Mask.resize(SrcTy->getNumElements()); + + // First check for a subvector obtained from a shufflevector. 
+ if (isa(SrcVecI)) { + Constant *ConstVec; + ArrayRef ShuffleMask; + if (!match(SrcVecI, m_Shuffle(m_Value(Vec), m_Constant(ConstVec), + m_Mask(ShuffleMask)))) + return false; + + auto *VecTy = dyn_cast(Vec->getType()); + if (!VecTy) + return false; + + const unsigned NumVecElts = VecTy->getNumElements(); + bool FoundVecOffset = false; + for (unsigned Idx = 0; Idx < ShuffleMask.size(); ++Idx) { + if (ShuffleMask[Idx] == PoisonMaskElem) + return false; + const unsigned ShuffleIdx = ShuffleMask[Idx]; + if (ShuffleIdx >= NumVecElts) { + const unsigned ConstIdx = ShuffleIdx - NumVecElts; + auto *ConstElt = + dyn_cast(ConstVec->getAggregateElement(ConstIdx)); + if (!ConstElt || !ConstElt->isNullValue()) + return false; + continue; + } + + if (FoundVecOffset) { + if (VecOffset + Idx != ShuffleIdx) + return false; + } else { + if (ShuffleIdx < Idx) + return false; + VecOffset = ShuffleIdx - Idx; + FoundVecOffset = true; + } + Mask.set(Idx); + } + return FoundVecOffset; + } + + // Check for a subvector obtained as an (insertelement V, 0, idx) + uint64_t InsertIdx; + if (!match(SrcVecI, + m_InsertElt(m_Value(Vec), m_Zero(), m_ConstantInt(InsertIdx)))) + return false; + + auto *VecTy = dyn_cast(Vec->getType()); + if (!VecTy) + return false; + VecOffset = 0; + bool AlreadyInsertedMaskedElt = Mask.test(InsertIdx); + Mask.set(); + if (!AlreadyInsertedMaskedElt) + Mask.reset(InsertIdx); + return true; +} + +/// Try to fold the join of two scalar integers whose contents are packed +/// elements of the same vector. 
+static Instruction *foldIntegerPackFromVector(Instruction &I, + InstCombiner::BuilderTy &Builder, + const DataLayout &DL) { + assert(I.getOpcode() == Instruction::Or); + Value *LhsVec, *RhsVec; + int64_t LhsVecOffset, RhsVecOffset; + SmallBitVector Mask; + if (!matchSubIntegerPackFromVector(I.getOperand(0), LhsVec, LhsVecOffset, + Mask, DL)) + return nullptr; + if (!matchSubIntegerPackFromVector(I.getOperand(1), RhsVec, RhsVecOffset, + Mask, DL)) + return nullptr; + if (LhsVec != RhsVec || LhsVecOffset != RhsVecOffset) + return nullptr; + + // Convert into shufflevector -> bitcast; + const unsigned ZeroVecIdx = + cast(LhsVec->getType())->getNumElements(); + SmallVector ShuffleMask(Mask.size(), ZeroVecIdx); + for (unsigned Idx : Mask.set_bits()) { + assert(LhsVecOffset + Idx >= 0); + ShuffleMask[Idx] = LhsVecOffset + Idx; + } + + Value *MaskedVec = Builder.CreateShuffleVector( + LhsVec, Constant::getNullValue(LhsVec->getType()), ShuffleMask, + I.getName() + ".v"); + return CastInst::Create(Instruction::BitCast, MaskedVec, I.getType()); +} + // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. 
@@ -3575,6 +3726,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Instruction *X = foldComplexAndOrPatterns(I, Builder)) return X; + if (Instruction *X = foldIntegerPackFromVector(I, Builder, DL)) + return X; + // (A & B) | (C & D) -> A ^ D where A == ~C && B == ~D // (A & B) | (C & D) -> A ^ C where A == ~D && B == ~C if (Value *V = foldOrOfInversions(I, Builder)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 6fe6ffa284fc6..4ec1af394464b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2743,7 +2743,7 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { if (DestTy == Src->getType()) return replaceInstUsesWith(CI, Src); - if (FixedVectorType *DestVTy = dyn_cast(DestTy)) { + if (isa(DestTy)) { if (isa(SrcTy)) { // If this is a cast from an integer to vector, check to see if the input // is a trunc or zext of a bitcast from vector. 
If so, we can replace all diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 4b42e86e25161..04a5f0c05c807 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -244,11 +244,10 @@ class PointerReplacer { void replacePointer(Value *V); private: - bool collectUsersRecursive(Instruction &I); void replace(Instruction *I); - Value *getReplacement(Value *I); + Value *getReplacement(Value *V) const { return WorkMap.lookup(V); } bool isAvailable(Instruction *I) const { - return I == &Root || Worklist.contains(I); + return I == &Root || UsersToReplace.contains(I); } bool isEqualOrValidAddrSpaceCast(const Instruction *I, @@ -260,8 +259,7 @@ class PointerReplacer { return (FromAS == ToAS) || IC.isValidAddrSpaceCast(FromAS, ToAS); } - SmallPtrSet ValuesToRevisit; - SmallSetVector Worklist; + SmallSetVector UsersToReplace; MapVector WorkMap; InstCombinerImpl &IC; Instruction &Root; @@ -270,72 +268,93 @@ class PointerReplacer { } // end anonymous namespace bool PointerReplacer::collectUsers() { - if (!collectUsersRecursive(Root)) - return false; + SmallVector Worklist; + SmallSetVector ValuesToRevisit; + + auto PushUsersToWorklist = [&](Instruction *Inst) { + for (auto *U : Inst->users()) + if (auto *I = dyn_cast(U)) + if (!isAvailable(I) && !ValuesToRevisit.contains(I)) + Worklist.emplace_back(I); + }; - // Ensure that all outstanding (indirect) users of I - // are inserted into the Worklist. Return false - // otherwise. 
- return llvm::set_is_subset(ValuesToRevisit, Worklist); -} + auto TryPushInstOperand = [&](Instruction *InstOp) { + if (!UsersToReplace.contains(InstOp)) { + if (!ValuesToRevisit.insert(InstOp)) + return false; + Worklist.emplace_back(InstOp); + } + return true; + }; -bool PointerReplacer::collectUsersRecursive(Instruction &I) { - for (auto *U : I.users()) { - auto *Inst = cast(&*U); + PushUsersToWorklist(&Root); + while (!Worklist.empty()) { + Instruction *Inst = Worklist.pop_back_val(); if (auto *Load = dyn_cast(Inst)) { if (Load->isVolatile()) return false; - Worklist.insert(Load); + UsersToReplace.insert(Load); } else if (auto *PHI = dyn_cast(Inst)) { - // All incoming values must be instructions for replacability - if (any_of(PHI->incoming_values(), - [](Value *V) { return !isa(V); })) - return false; - - // If at least one incoming value of the PHI is not in Worklist, - // store the PHI for revisiting and skip this iteration of the - // loop. - if (any_of(PHI->incoming_values(), [this](Value *V) { - return !isAvailable(cast(V)); + /// TODO: Handle poison and null pointers for PHI and select. + // If all incoming values are available, mark this PHI as + // replacable and push it's users into the worklist. + bool IsReplaceable = true; + if (all_of(PHI->incoming_values(), [&](Value *V) { + if (!isa(V)) + return IsReplaceable = false; + return isAvailable(cast(V)); })) { - ValuesToRevisit.insert(Inst); + UsersToReplace.insert(PHI); + PushUsersToWorklist(PHI); continue; } - Worklist.insert(PHI); - if (!collectUsersRecursive(*PHI)) + // Either an incoming value is not an instruction or not all + // incoming values are available. If this PHI was already + // visited prior to this iteration, return false. + if (!IsReplaceable || !ValuesToRevisit.insert(PHI)) return false; + + // Push PHI back into the stack, followed by unavailable + // incoming values. 
+ Worklist.emplace_back(PHI); + for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); ++Idx) { + if (!TryPushInstOperand(cast(PHI->getIncomingValue(Idx)))) + return false; + } } else if (auto *SI = dyn_cast(Inst)) { - if (!isa(SI->getTrueValue()) || - !isa(SI->getFalseValue())) + auto *TrueInst = dyn_cast(SI->getTrueValue()); + auto *FalseInst = dyn_cast(SI->getFalseValue()); + if (!TrueInst || !FalseInst) return false; - if (!isAvailable(cast(SI->getTrueValue())) || - !isAvailable(cast(SI->getFalseValue()))) { - ValuesToRevisit.insert(Inst); + if (isAvailable(TrueInst) && isAvailable(FalseInst)) { + UsersToReplace.insert(SI); + PushUsersToWorklist(SI); continue; } - Worklist.insert(SI); - if (!collectUsersRecursive(*SI)) - return false; - } else if (isa(Inst)) { - Worklist.insert(Inst); - if (!collectUsersRecursive(*Inst)) + + // Push select back onto the stack, followed by unavailable true/false + // value. + Worklist.emplace_back(SI); + if (!TryPushInstOperand(TrueInst) || !TryPushInstOperand(FalseInst)) return false; + } else if (auto *GEP = dyn_cast(Inst)) { + UsersToReplace.insert(GEP); + PushUsersToWorklist(GEP); } else if (auto *MI = dyn_cast(Inst)) { if (MI->isVolatile()) return false; - Worklist.insert(Inst); + UsersToReplace.insert(Inst); } else if (isEqualOrValidAddrSpaceCast(Inst, FromAS)) { - Worklist.insert(Inst); - if (!collectUsersRecursive(*Inst)) - return false; + UsersToReplace.insert(Inst); + PushUsersToWorklist(Inst); } else if (Inst->isLifetimeStartOrEnd()) { continue; } else { // TODO: For arbitrary uses with address space mismatches, should we check // if we can introduce a valid addrspacecast? 
- LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n'); + LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *Inst << '\n'); return false; } } @@ -343,7 +362,39 @@ bool PointerReplacer::collectUsersRecursive(Instruction &I) { return true; } -Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); } +void PointerReplacer::replacePointer(Value *V) { + assert(cast(Root.getType()) != cast(V->getType()) && + "Invalid usage"); + WorkMap[&Root] = V; + SmallVector Worklist; + SetVector PostOrderWorklist; + SmallPtrSet Visited; + + // Perform a postorder traversal of the users of Root. + Worklist.push_back(&Root); + while (!Worklist.empty()) { + Instruction *I = Worklist.back(); + + // If I has not been processed before, push each of its + // replacable users into the worklist. + if (Visited.insert(I).second) { + for (auto *U : I->users()) { + auto *UserInst = cast(U); + if (UsersToReplace.contains(UserInst) && !Visited.contains(UserInst)) + Worklist.push_back(UserInst); + } + // Otherwise, users of I have already been pushed into + // the PostOrderWorklist. Push I as well. + } else { + PostOrderWorklist.insert(I); + Worklist.pop_back(); + } + } + + // Replace pointers in reverse-postorder. + for (Instruction *I : reverse(PostOrderWorklist)) + replace(I); +} void PointerReplacer::replace(Instruction *I) { if (getReplacement(I)) @@ -360,15 +411,20 @@ void PointerReplacer::replace(Instruction *I) { IC.InsertNewInstWith(NewI, LT->getIterator()); IC.replaceInstUsesWith(*LT, NewI); - WorkMap[LT] = NewI; + // LT has actually been replaced by NewI. It is useless to insert LT into + // the map. Instead, we insert NewI into the map to indicate this is the + // replacement (new value). 
+ WorkMap[NewI] = NewI; } else if (auto *PHI = dyn_cast(I)) { - Type *NewTy = getReplacement(PHI->getIncomingValue(0))->getType(); - auto *NewPHI = PHINode::Create(NewTy, PHI->getNumIncomingValues(), - PHI->getName(), PHI->getIterator()); - for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I) - NewPHI->addIncoming(getReplacement(PHI->getIncomingValue(I)), - PHI->getIncomingBlock(I)); - WorkMap[PHI] = NewPHI; + // Create a new PHI by replacing any incoming value that is a user of the + // root pointer and has a replacement. + Value *V = WorkMap.lookup(PHI->getIncomingValue(0)); + PHI->mutateType(V ? V->getType() : PHI->getIncomingValue(0)->getType()); + for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I) { + Value *V = WorkMap.lookup(PHI->getIncomingValue(I)); + PHI->setIncomingValue(I, V ? V : PHI->getIncomingValue(I)); + } + WorkMap[PHI] = PHI; } else if (auto *GEP = dyn_cast(I)) { auto *V = getReplacement(GEP->getPointerOperand()); assert(V && "Operand not replaced"); @@ -432,18 +488,6 @@ void PointerReplacer::replace(Instruction *I) { } } -void PointerReplacer::replacePointer(Value *V) { -#ifndef NDEBUG - auto *PT = cast(Root.getType()); - auto *NT = cast(V->getType()); - assert(PT != NT && "Invalid usage"); -#endif - WorkMap[&Root] = V; - - for (Instruction *Workitem : Worklist) - replace(Workitem); -} - Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) { if (auto *I = simplifyAllocaArraySize(*this, AI, DT)) return I; diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 8a3e0bc3eb971..4a83eb9c69783 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1096,10 +1096,12 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { if (!Preheader) return false; bool MadeAnyChanges = false; - BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt(); - BasicBlock::iterator I(Preheader->getTerminator()); - while 
(I != Preheader->begin()) { - --I; + for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { + + // Skip BB Terminator. + if (Preheader->getTerminator() == &I) + continue; + // New instructions were inserted at the end of the preheader. if (isa(I)) break; @@ -1110,28 +1112,28 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { // memory. Note that it's okay if the instruction might have undefined // behavior: LoopSimplify guarantees that the preheader dominates the exit // block. - if (I->mayHaveSideEffects() || I->mayReadFromMemory()) + if (I.mayHaveSideEffects() || I.mayReadFromMemory()) continue; - // Skip debug info intrinsics. - if (isa(I)) + // Skip debug or pseudo instructions. + if (I.isDebugOrPseudoInst()) continue; // Skip eh pad instructions. - if (I->isEHPad()) + if (I.isEHPad()) continue; // Don't sink alloca: we never want to sink static alloca's out of the // entry block, and correctly sinking dynamic alloca's requires // checks for stacksave/stackrestore intrinsics. // FIXME: Refactor this check somehow? - if (isa(I)) + if (isa(&I)) continue; // Determine if there is a use in or before the loop (direct or // otherwise). bool UsedInLoop = false; - for (Use &U : I->uses()) { + for (Use &U : I.uses()) { Instruction *User = cast(U.getUser()); BasicBlock *UseBB = User->getParent(); if (PHINode *P = dyn_cast(User)) { @@ -1150,26 +1152,9 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { continue; // Otherwise, sink it to the exit block. - Instruction *ToMove = &*I; - bool Done = false; - - if (I != Preheader->begin()) { - // Skip debug info intrinsics. 
- do { - --I; - } while (I->isDebugOrPseudoInst() && I != Preheader->begin()); - - if (I->isDebugOrPseudoInst() && I == Preheader->begin()) - Done = true; - } else { - Done = true; - } - + I.moveBefore(ExitBlock->getFirstInsertionPt()); + SE->forgetValue(&I); MadeAnyChanges = true; - ToMove->moveBefore(*ExitBlock, InsertPt); - SE->forgetValue(ToMove); - if (Done) break; - InsertPt = ToMove->getIterator(); } return MadeAnyChanges; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 322c8d6325a89..6ba2387d45b36 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -797,7 +797,7 @@ static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost, static std::optional shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, const unsigned TripMultiple, const unsigned TripCount, - const UnrollCostEstimator UCE, + unsigned MaxTripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP) { // Using unroll pragma @@ -827,6 +827,10 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, return TripCount; } + if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount && + MaxTripCount <= UnrollMaxUpperBound) + return MaxTripCount; + // if didn't return until here, should continue to other priorties return std::nullopt; } @@ -953,7 +957,7 @@ bool llvm::computeUnrollCount( // 1st priority is unroll count set by "unroll-count" option. // 2nd priority is unroll count set by pragma. 
if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount, - UCE, UP)) { + MaxTripCount, UCE, UP)) { UP.Count = *UnrollFactor; if (UserUnrollCount || (PragmaCount > 0)) { diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index b1f742b838f2a..3fb724f163d4b 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -129,8 +129,6 @@ using BBPredicates = DenseMap; using PredMap = DenseMap; using BB2BBMap = DenseMap; -using BranchDebugLocMap = DenseMap; - // A traits type that is intended to be used in graph algorithms. The graph // traits starts at an entry node, and traverses the RegionNodes that are in // the Nodes set. @@ -303,8 +301,6 @@ class StructurizeCFG { PredMap LoopPreds; BranchVector LoopConds; - BranchDebugLocMap TermDL; - RegionNode *PrevNode; void orderNodes(); @@ -336,14 +332,14 @@ class StructurizeCFG { void simplifyAffectedPhis(); - void killTerminator(BasicBlock *BB); + DebugLoc killTerminator(BasicBlock *BB); void changeExit(RegionNode *Node, BasicBlock *NewExit, bool IncludeDominator); BasicBlock *getNextFlow(BasicBlock *Dominator); - BasicBlock *needPrefix(bool NeedEmpty); + std::pair needPrefix(bool NeedEmpty); BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed); @@ -595,14 +591,14 @@ void StructurizeCFG::collectInfos() { // Find the last back edges analyzeLoops(RN); } - +/* // Reset the collected term debug locations TermDL.clear(); for (BasicBlock &BB : *Func) { if (const DebugLoc &DL = BB.getTerminator()->getDebugLoc()) TermDL[&BB] = DL; - } + } */ } /// Insert the missing branch conditions @@ -618,25 +614,28 @@ void StructurizeCFG::insertConditions(bool Loops) { BasicBlock *SuccTrue = Term->getSuccessor(0); BasicBlock *SuccFalse = Term->getSuccessor(1); - BBPredicates &Preds = Loops ? 
LoopPreds[SuccFalse] : Predicates[SuccTrue]; + PhiInserter.Initialize(Boolean, ""); + PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default); - if (Preds.size() == 1 && Preds.begin()->first == Parent) { - auto &PI = Preds.begin()->second; - Term->setCondition(PI.Pred); - CondBranchWeights::setMetadata(*Term, PI.Weights); - } else { - PhiInserter.Initialize(Boolean, ""); - PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default); + BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue]; - NearestCommonDominator Dominator(DT); - Dominator.addBlock(Parent); + NearestCommonDominator Dominator(DT); + Dominator.addBlock(Parent); - for (auto [BB, PI] : Preds) { - assert(BB != Parent); - PhiInserter.AddAvailableValue(BB, PI.Pred); - Dominator.addAndRememberBlock(BB); + PredInfo ParentInfo{nullptr, std::nullopt}; + for (auto [BB, PI] : Preds) { + if (BB == Parent) { + ParentInfo = PI; + break; } + PhiInserter.AddAvailableValue(BB, PI.Pred); + Dominator.addAndRememberBlock(BB); + } + if (ParentInfo.Pred) { + Term->setCondition(ParentInfo.Pred); + CondBranchWeights::setMetadata(*Term, ParentInfo.Weights); + } else { if (!Dominator.resultIsRememberedBlock()) PhiInserter.AddAvailableValue(Dominator.result(), Default); @@ -858,10 +857,10 @@ void StructurizeCFG::setPhiValues() { PhiMap &Map = DeletedPhis[To]; SmallVector &UndefBlks = UndefBlksMap[To]; for (const auto &[Phi, Incoming] : Map) { - Value *Undef = UndefValue::get(Phi->getType()); + Value *Poison = PoisonValue::get(Phi->getType()); Updater.Initialize(Phi->getType(), ""); - Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); - Updater.AddAvailableValue(To, Undef); + Updater.AddAvailableValue(&Func->getEntryBlock(), Poison); + Updater.AddAvailableValue(To, Poison); // Use leader phi's incoming if there is. 
auto LeaderIt = PhiClasses.findLeader(Phi); @@ -890,7 +889,7 @@ void StructurizeCFG::setPhiValues() { if (Updater.HasValueForBlock(UB)) continue; - Updater.AddAvailableValue(UB, Undef); + Updater.AddAvailableValue(UB, Poison); } for (BasicBlock *FI : From) @@ -924,15 +923,17 @@ void StructurizeCFG::simplifyAffectedPhis() { } /// Remove phi values from all successors and then remove the terminator. -void StructurizeCFG::killTerminator(BasicBlock *BB) { +DebugLoc StructurizeCFG::killTerminator(BasicBlock *BB) { Instruction *Term = BB->getTerminator(); if (!Term) - return; + return DebugLoc(); for (BasicBlock *Succ : successors(BB)) delPhiValues(BB, Succ); + DebugLoc DL = Term->getDebugLoc(); Term->eraseFromParent(); + return DL; } /// Let node exit(s) point to NewExit @@ -971,9 +972,9 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit, SubRegion->replaceExit(NewExit); } else { BasicBlock *BB = Node->getNodeAs(); - killTerminator(BB); + DebugLoc DL = killTerminator(BB); BranchInst *Br = BranchInst::Create(NewExit, BB); - Br->setDebugLoc(TermDL[BB]); + Br->setDebugLoc(DL); addPhiValues(BB, NewExit); if (IncludeDominator) DT->changeImmediateDominator(NewExit, BB); @@ -988,25 +989,20 @@ BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) { BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, Func, Insert); FlowSet.insert(Flow); - - // use a temporary variable to avoid a use-after-free if the map's storage is - // reallocated - DebugLoc DL = TermDL[Dominator]; - TermDL[Flow] = std::move(DL); - DT->addNewBlock(Flow, Dominator); ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); return Flow; } -/// Create a new or reuse the previous node as flow node -BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) { +/// Create a new or reuse the previous node as flow node. Returns a block and a +/// debug location to be used for new instructions in that block. 
+std::pair StructurizeCFG::needPrefix(bool NeedEmpty) { BasicBlock *Entry = PrevNode->getEntry(); if (!PrevNode->isSubRegion()) { - killTerminator(Entry); + DebugLoc DL = killTerminator(Entry); if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end()) - return Entry; + return {Entry, DL}; } // create a new flow node @@ -1015,7 +1011,7 @@ BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) { // and wire it up changeExit(PrevNode, Flow, true); PrevNode = ParentRegion->getBBNode(Flow); - return Flow; + return {Flow, DebugLoc()}; } /// Returns the region exit if possible, otherwise just a new flow node @@ -1079,7 +1075,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed, PrevNode = Node; } else { // Insert extra prefix node (or reuse last one) - BasicBlock *Flow = needPrefix(false); + auto [Flow, DL] = needPrefix(false); // Insert extra postfix node (or use exit instead) BasicBlock *Entry = Node->getEntry(); @@ -1087,7 +1083,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed, // let it point to entry and next block BranchInst *Br = BranchInst::Create(Entry, Next, BoolPoison, Flow); - Br->setDebugLoc(TermDL[Flow]); + Br->setDebugLoc(DL); Conditions.push_back(Br); addPhiValues(Flow, Entry); DT->changeImmediateDominator(Entry, Flow); @@ -1114,7 +1110,7 @@ void StructurizeCFG::handleLoops(bool ExitUseAllowed, } if (!isPredictableTrue(Node)) - LoopStart = needPrefix(true); + LoopStart = needPrefix(true).first; LoopEnd = Loops[Node->getEntry()]; wireFlow(false, LoopEnd); @@ -1125,10 +1121,11 @@ void StructurizeCFG::handleLoops(bool ExitUseAllowed, assert(LoopStart != &LoopStart->getParent()->getEntryBlock()); // Create an extra loop end node - LoopEnd = needPrefix(false); + DebugLoc DL; + std::tie(LoopEnd, DL) = needPrefix(false); BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed); BranchInst *Br = BranchInst::Create(Next, LoopStart, BoolPoison, LoopEnd); - Br->setDebugLoc(TermDL[LoopEnd]); + Br->setDebugLoc(DL); LoopConds.push_back(Br); 
addPhiValues(LoopEnd, LoopStart); setPrevNode(Next); @@ -1181,9 +1178,9 @@ void StructurizeCFG::rebuildSSA() { continue; if (!Initialized) { - Value *Undef = UndefValue::get(I.getType()); + Value *Poison = PoisonValue::get(I.getType()); Updater.Initialize(I.getType(), ""); - Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); + Updater.AddAvailableValue(&Func->getEntryBlock(), Poison); Updater.AddAvailableValue(BB, &I); Initialized = true; } @@ -1328,7 +1325,6 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) { LoopPreds.clear(); LoopConds.clear(); FlowSet.clear(); - TermDL.clear(); return true; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2532edc5d8699..d7aa0096dfebf 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -791,7 +791,7 @@ isFixedVectorShuffle(ArrayRef VL, SmallVectorImpl &Mask, } /// \returns True if Extract{Value,Element} instruction extracts element Idx. 
-static std::optional getExtractIndex(Instruction *E) { +static std::optional getExtractIndex(const Instruction *E) { unsigned Opcode = E->getOpcode(); assert((Opcode == Instruction::ExtractElement || Opcode == Instruction::ExtractValue) && @@ -12799,25 +12799,47 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef Mask, - bool) { + bool ForSingleMask) { InstructionCost C = 0; unsigned VF = Mask.size(); unsigned VecVF = TE->getVectorFactor(); - if (VF != VecVF && - (any_of(Mask, [VF](int Idx) { return Idx >= static_cast(VF); }) || - !ShuffleVectorInst::isIdentityMask(Mask, VF))) { - SmallVector OrigMask(VecVF, PoisonMaskElem); - std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), - OrigMask.begin()); - C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - getWidenedType(TE->getMainOp()->getType(), VecVF), - OrigMask); - LLVM_DEBUG( - dbgs() << "SLP: Adding cost " << C - << " for final shuffle of insertelement external users.\n"; - TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - return std::make_pair(TE, true); + bool HasLargeIndex = + any_of(Mask, [VF](int Idx) { return Idx >= static_cast(VF); }); + if ((VF != VecVF && HasLargeIndex) || + !ShuffleVectorInst::isIdentityMask(Mask, VF)) { + + if (HasLargeIndex) { + SmallVector OrigMask(VecVF, PoisonMaskElem); + std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), + OrigMask.begin()); + C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, + getWidenedType(TE->getMainOp()->getType(), VecVF), + OrigMask); + LLVM_DEBUG( + dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement external users.\n"; + TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + return std::make_pair(TE, true); + } + + if (!ForSingleMask) { + SmallVector ResizeMask(VF, 
PoisonMaskElem); + for (unsigned I = 0; I < VF; ++I) { + if (Mask[I] != PoisonMaskElem) + ResizeMask[Mask[I]] = Mask[I]; + } + if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF)) + C = ::getShuffleCost( + *TTI, TTI::SK_PermuteSingleSrc, + getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask); + LLVM_DEBUG( + dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement external users.\n"; + TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); + + Cost += C; + } } return std::make_pair(TE, false); }; @@ -21531,8 +21553,38 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); InstructionsState S = getSameOpcode({I1, I2}, *TLI); - if (S && !S.isAltShuffle()) + if (S && !S.isAltShuffle()) { + const auto *E1 = dyn_cast(I1); + const auto *E2 = dyn_cast(I2); + if (!E1 || !E2) + continue; + + // Sort on ExtractElementInsts primarily by vector operands. Prefer + // program order of the vector operands. + const auto *V1 = dyn_cast(E1->getVectorOperand()); + const auto *V2 = dyn_cast(E2->getVectorOperand()); + if (V1 != V2) { + if (!V1 || !V2) + continue; + if (V1->getParent() != V2->getParent()) + continue; + return V1->comesBefore(V2); + } + // If we have the same vector operand, try to sort by constant + // index. + std::optional Id1 = getExtractIndex(E1); + std::optional Id2 = getExtractIndex(E2); + // Bring constants to the top + if (Id1 && !Id2) + return true; + if (!Id1 && Id2) + return false; + // First elements come first. 
+ if (Id1 && Id2) + return *Id1 < *Id2; + continue; + } return I1->getOpcode() < I2->getOpcode(); } if (I1) diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 2b11293c0f870..f070cd767f2e1 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -506,6 +506,9 @@ if(build_runtimes) if("offload" IN_LIST LLVM_ENABLE_RUNTIMES) # With ROCm 6.3 the ROCr runtime and the thunk layer share a single repository. # No need to provide a separate path for ROCt. + if (OFFLOAD_EXTERNAL_PROJECT_UNIFIED_ROCR OR DEFINED LIBOMPTARGET_EXTERNAL_PROJECT_ROCM_DEVICE_LIBS_PATH) + list(APPEND extra_cmake_args "-DCMAKE_PREFIX_PATH=${CMAKE_BINARY_DIR}/lib/cmake$${CMAKE_PREFIX_PATH}") + endif() if (OFFLOAD_EXTERNAL_PROJECT_UNIFIED_ROCR) if(NOT DEFINED LIBOMPTARGET_EXTERNAL_PROJECT_HSA_PATH) message(SEND_ERROR "External ROCr requires setting LIBOMPTARGET_EXTERNAL_PROJECT_HSA_PATH") @@ -517,45 +520,36 @@ if(build_runtimes) DEPENDS clang llvm-link lld opt llvm-objcopy INSTALL_COMMAND "" CMAKE_ARGS -DBUILD_SHARED_LIBS=ON - -DCMAKE_PREFIX_PATH=${CMAKE_BINARY_DIR}/lib/cmake -DIMAGE_SUPPORT=OFF - -DLLVM_RUNTIME_OPENMP=ON) + -DLLVM_RUNTIME_OPENMP=ON + ${extra_cmake_args}) set(HSA_DEP rocr-runtime) + endif() - # omptarget device RTL depends on device libs, leading to circular dependency in build scripts. - # Providing path to the sources enables to build them as part of compiler build, which - # removes the ciruclar dependency on the script-side. - if (DEFINED LIBOMPTARGET_EXTERNAL_PROJECT_ROCM_DEVICE_LIBS_PATH) - message(STATUS "Add external AMD device-libs: ${LIBOMPTARGET_EXTERNAL_PROJECT_ROCM_DEVICE_LIBS_PATH}") + # omptarget device RTL depends on device libs, leading to circular dependency in build scripts. + # Providing path to the sources enables to build them as part of compiler build, which + # removes the ciruclar dependency on the script-side. 
+ if (DEFINED LIBOMPTARGET_EXTERNAL_PROJECT_ROCM_DEVICE_LIBS_PATH) + message(STATUS "Add external AMD device-libs: ${LIBOMPTARGET_EXTERNAL_PROJECT_ROCM_DEVICE_LIBS_PATH}") + if (NOT ${ROCM_DEVICE_LIBS_INSTALL_PREFIX_PATH} STREQUAL "") ExternalProject_Add(rocm-device-libs SOURCE_DIR ${LIBOMPTARGET_EXTERNAL_PROJECT_ROCM_DEVICE_LIBS_PATH} DEPENDS clang llvm-link lld opt llvm-objcopy CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCM_DEVICE_LIBS_INSTALL_PREFIX_PATH} - -DCMAKE_PREFIX_PATH=${CMAKE_BINARY_DIR}/lib/cmake + -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_NEW=${ROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC} + -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_OLD=amdgcn + ${extra_cmake_args}) + else() + ExternalProject_Add(rocm-device-libs + SOURCE_DIR ${LIBOMPTARGET_EXTERNAL_PROJECT_ROCM_DEVICE_LIBS_PATH} + DEPENDS clang llvm-link lld opt llvm-objcopy + INSTALL_COMMAND "" + CMAKE_ARGS -DCMAKE_PREFIX_PATH=${CMAKE_BINARY_DIR}/lib/cmake -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_NEW=${ROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC} -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_OLD=amdgcn) endif() endif() - if(DEFINED LIBOMPTARGET_EXTERNAL_PROJECT_HSA_PATH AND - DEFINED LIBOMPTARGET_EXTERNAL_PROJECT_THUNK_PATH AND - DEFINED LIBOMPTARGET_EXTERNAL_PROJECT_ROCM_DEVICE_LIBS_PATH) - ExternalProject_Add(roct - SOURCE_DIR ${LIBOMPTARGET_EXTERNAL_PROJECT_THUNK_PATH} - DEPENDS clang llvm-link lld opt llvm-objcopy - INSTALL_COMMAND "" - CMAKE_ARGS -DCMAKE_PREFIX_PATH=${CMAKE_BINARY_DIR}/lib/cmake) - ExternalProject_Add(rocr-runtime - SOURCE_DIR ${LIBOMPTARGET_EXTERNAL_PROJECT_HSA_PATH} - DEPENDS clang llvm-link lld opt llvm-objcopy roct - INSTALL_COMMAND "" - CMAKE_ARGS -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH=${CMAKE_BINARY_DIR}/lib/cmake -DIMAGE_SUPPORT=OFF -DLLVM_RUNTIME_OPENMP=ON -DHSAKMT_SRC_PATH=${LIBOMPTARGET_EXTERNAL_PROJECT_THUNK_PATH}) - set(HSA_DEP rocr-runtime) - ExternalProject_Add(rocm-device-libs - SOURCE_DIR ${LIBOMPTARGET_EXTERNAL_PROJECT_ROCM_DEVICE_LIBS_PATH} - DEPENDS clang llvm-link lld opt llvm-objcopy - 
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCM_DEVICE_LIBS_INSTALL_PREFIX_PATH} -DCMAKE_PREFIX_PATH=${CMAKE_BINARY_DIR}/lib/cmake -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_NEW=${ROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC} -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_OLD=amdgcn) - endif() if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) set(AMDGPU_ARCH_DEP amdgpu-arch) endif() diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir index 9d15b8990bad3..6e1b5d641a8b7 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform.mir @@ -12,7 +12,7 @@ body: | ; CHECK-NEXT: ALL VALUES UNIFORM %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec + %2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec %3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec $sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec @@ -33,7 +33,7 @@ body: | %4:sgpr_32 = V_READLANE_B32 $vgpr0, 0, implicit $exec $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec $sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec - %5:sgpr_32 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec + %5:sreg_32_xm0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec S_ENDPGM 0 ... 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir index 9a7e755e5f5c8..f7c874be87d36 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir @@ -14,7 +14,7 @@ body: | %0:vreg_64 = IMPLICIT_DEF %1:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) %2:vgpr_32(s32) = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1) - %3:sreg_32 = V_READFIRSTLANE_B32 %1(s32), implicit $exec + %3:sreg_32_xm0 = V_READFIRSTLANE_B32 %1(s32), implicit $exec S_ENDPGM 0 ... @@ -50,7 +50,7 @@ body: | %1:vreg_64 = IMPLICIT_DEF %2:vgpr_32(s32) = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) %3:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - %4:sreg_32 = V_READFIRSTLANE_B32 %2(s32), implicit $exec + %4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2(s32), implicit $exec S_ENDPGM 0 ... 
@@ -104,7 +104,7 @@ body: | %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF - %2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec + %2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec %3:sgpr_32 = V_READLANE_B32 %1, 0, implicit $exec $sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec $sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec diff --git a/llvm/test/Assembler/DIExpressionNewDebugRecords.ll b/llvm/test/Assembler/DIExpressionNewDebugRecords.ll new file mode 100644 index 0000000000000..abb7008653502 --- /dev/null +++ b/llvm/test/Assembler/DIExpressionNewDebugRecords.ll @@ -0,0 +1,28 @@ +; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s + +; CHECK: %struct.S = type { i32 } +%struct.S = type { i32 } + +define dso_local i32 @f() !dbg !7 { +entry: + ; CHECK: #dbg_value(ptr null, !9, !DIExpression(DIOpArg(0, ptr), DIOpDeref(%struct.S)), !11) + #dbg_value(ptr null, !9, !DIExpression(DIOpArg(0, ptr), DIOpDeref(%struct.S)), !11) + ret i32 0, !dbg !11 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 18.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "print.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 5} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 18.0.0"} +!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !8, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!8 = !DISubroutineType(types: !2) +!9 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 3, type: !10) +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DILocation(line: 3, column: 15, scope: !7) diff --git a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll 
b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll index 6b12d954b9d1c..43d49da1abd21 100644 --- a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll +++ b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll @@ -13,25 +13,26 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define i64 @bfis_in_loop_zero() { ; CHECK-LABEL: bfis_in_loop_zero: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adrp x8, :got:global -; CHECK-NEXT: mov x0, xzr -; CHECK-NEXT: mov w9, wzr -; CHECK-NEXT: ldr x8, [x8, :got_lo12:global] -; CHECK-NEXT: ldr x8, [x8] -; CHECK-NEXT: .LBB0_1: // %midblock -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrh w10, [x8, #72] -; CHECK-NEXT: ldr x13, [x8, #8] -; CHECK-NEXT: lsr w11, w10, #8 -; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: ldr x8, [x13, #16] -; CHECK-NEXT: cset w12, ne -; CHECK-NEXT: csel w9, w9, w11, eq -; CHECK-NEXT: and x11, x0, #0xffffffff00000000 -; CHECK-NEXT: bfi w10, w9, #8, #24 -; CHECK-NEXT: orr x11, x11, x12, lsl #16 -; CHECK-NEXT: orr x0, x11, x10 -; CHECK-NEXT: cbnz x13, .LBB0_1 +; CHECK-NEXT: adrp x9, :got:global +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ldr x9, [x9, :got_lo12:global] +; CHECK-NEXT: mov w10, #65536 // =0x10000 +; CHECK-NEXT: ldr x9, [x9] +; CHECK-NEXT: .LBB0_1: // %midblock +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrh w11, [x9, #72] +; CHECK-NEXT: and x13, x0, #0xffffffff00000000 +; CHECK-NEXT: lsr w12, w11, #8 +; CHECK-NEXT: cmp w11, #0 +; CHECK-NEXT: csel w8, w8, w12, eq +; CHECK-NEXT: ldr x12, [x9, #8] +; CHECK-NEXT: csel x9, xzr, x10, eq +; CHECK-NEXT: bfi w11, w8, #8, #24 +; CHECK-NEXT: orr x13, x9, x13 +; CHECK-NEXT: ldr x9, [x12, #16] +; CHECK-NEXT: orr x0, x13, x11 +; CHECK-NEXT: cbnz x12, .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret entry: @@ -80,25 +81,26 @@ exit: define i64 @bfis_in_loop_undef() { ; CHECK-LABEL: bfis_in_loop_undef: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adrp x9, :got:global -; CHECK-NEXT: mov w8, wzr 
-; CHECK-NEXT: // implicit-def: $x0 -; CHECK-NEXT: ldr x9, [x9, :got_lo12:global] -; CHECK-NEXT: ldr x9, [x9] -; CHECK-NEXT: .LBB1_1: // %midblock -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrh w10, [x9, #72] -; CHECK-NEXT: ldr x13, [x9, #8] -; CHECK-NEXT: lsr w11, w10, #8 -; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: ldr x9, [x13, #16] -; CHECK-NEXT: cset w12, ne -; CHECK-NEXT: csel w8, w8, w11, eq -; CHECK-NEXT: and x11, x0, #0xffffffff00000000 -; CHECK-NEXT: bfi w10, w8, #8, #24 -; CHECK-NEXT: orr x11, x11, x12, lsl #16 -; CHECK-NEXT: orr x0, x11, x10 -; CHECK-NEXT: cbnz x13, .LBB1_1 +; CHECK-NEXT: adrp x9, :got:global +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: // implicit-def: $x0 +; CHECK-NEXT: ldr x9, [x9, :got_lo12:global] +; CHECK-NEXT: ldr x10, [x9] +; CHECK-NEXT: mov w9, #65536 // =0x10000 +; CHECK-NEXT: .LBB1_1: // %midblock +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrh w11, [x10, #72] +; CHECK-NEXT: and x13, x0, #0xffffffff00000000 +; CHECK-NEXT: lsr w12, w11, #8 +; CHECK-NEXT: cmp w11, #0 +; CHECK-NEXT: csel w8, w8, w12, eq +; CHECK-NEXT: ldr x12, [x10, #8] +; CHECK-NEXT: csel x10, xzr, x9, eq +; CHECK-NEXT: bfi w11, w8, #8, #24 +; CHECK-NEXT: orr x13, x10, x13 +; CHECK-NEXT: ldr x10, [x12, #16] +; CHECK-NEXT: orr x0, x13, x11 +; CHECK-NEXT: cbnz x12, .LBB1_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/select_cc.ll b/llvm/test/CodeGen/AArch64/select_cc.ll index 92c8087518151..70063eff04a70 100644 --- a/llvm/test/CodeGen/AArch64/select_cc.ll +++ b/llvm/test/CodeGen/AArch64/select_cc.ll @@ -2,12 +2,19 @@ ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s define i64 @select_ogt_float(float %a, float %b) { -; CHECK-LABEL: select_ogt_float: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w8, gt -; CHECK-NEXT: ubfiz x0, x8, #2, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: select_ogt_float: +; CHECK-SD: // %bb.0: // %entry +; 
CHECK-SD-NEXT: fcmp s0, s1 +; CHECK-SD-NEXT: mov w8, #4 // =0x4 +; CHECK-SD-NEXT: csel x0, x8, xzr, gt +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: select_ogt_float: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcmp s0, s1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: lsl x0, x8, #2 +; CHECK-GI-NEXT: ret entry: %cc = fcmp ogt float %a, %b %sel = select i1 %cc, i64 4, i64 0 @@ -15,12 +22,19 @@ entry: } define i64 @select_ule_float_inverse(float %a, float %b) { -; CHECK-LABEL: select_ule_float_inverse: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcmp s0, s1 -; CHECK-NEXT: cset w8, gt -; CHECK-NEXT: ubfiz x0, x8, #2, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: select_ule_float_inverse: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcmp s0, s1 +; CHECK-SD-NEXT: mov w8, #4 // =0x4 +; CHECK-SD-NEXT: csel x0, xzr, x8, le +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: select_ule_float_inverse: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fcmp s0, s1 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: lsl x0, x8, #2 +; CHECK-GI-NEXT: ret entry: %cc = fcmp ule float %a, %b %sel = select i1 %cc, i64 0, i64 4 @@ -28,12 +42,19 @@ entry: } define i64 @select_eq_i32(i32 %a, i32 %b) { -; CHECK-LABEL: select_eq_i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w0, w1 -; CHECK-NEXT: cset w8, eq -; CHECK-NEXT: ubfiz x0, x8, #2, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: select_eq_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #4 // =0x4 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: csel x0, x8, xzr, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: select_eq_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: lsl x0, x8, #2 +; CHECK-GI-NEXT: ret entry: %cc = icmp eq i32 %a, %b %sel = select i1 %cc, i64 4, i64 0 @@ -41,12 +62,19 @@ entry: } define i64 @select_ne_i32_inverse(i32 %a, i32 %b) { -; CHECK-LABEL: select_ne_i32_inverse: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w0, w1 -; 
CHECK-NEXT: cset w8, eq -; CHECK-NEXT: ubfiz x0, x8, #2, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: select_ne_i32_inverse: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #4 // =0x4 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: csel x0, xzr, x8, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: select_ne_i32_inverse: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, eq +; CHECK-GI-NEXT: lsl x0, x8, #2 +; CHECK-GI-NEXT: ret entry: %cc = icmp ne i32 %a, %b %sel = select i1 %cc, i64 0, i64 4 diff --git a/llvm/test/CodeGen/AArch64/selectopt-const.ll b/llvm/test/CodeGen/AArch64/selectopt-const.ll index f10327e136ad1..a44c746e0f281 100644 --- a/llvm/test/CodeGen/AArch64/selectopt-const.ll +++ b/llvm/test/CodeGen/AArch64/selectopt-const.ll @@ -13,24 +13,24 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) { ; CHECK-NEXT: mov w8, w3 ; CHECK-NEXT: movk w9, #16309, lsl #16 ; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: .p2align 5, , 16 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr s4, [x1], #4 -; CHECK-NEXT: ldr w9, [x0], #4 -; CHECK-NEXT: add w9, w9, #10 -; CHECK-NEXT: scvtf d3, w9 +; CHECK-NEXT: ldr w10, [x0], #4 +; CHECK-NEXT: add w10, w10, #10 +; CHECK-NEXT: scvtf d3, w10 ; CHECK-NEXT: fmadd s4, s4, s0, s1 ; CHECK-NEXT: fabs s4, s4 ; CHECK-NEXT: fcvt d4, s4 ; CHECK-NEXT: fdiv d3, d3, d4 ; CHECK-NEXT: fcmp d3, d2 -; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: csel x10, x9, xzr, lt ; CHECK-NEXT: subs x8, x8, #1 -; CHECK-NEXT: ubfiz x9, x9, #4, #32 -; CHECK-NEXT: ldr s3, [x4, x9] -; CHECK-NEXT: fcvtzs w9, s3 -; CHECK-NEXT: str w9, [x2], #4 +; CHECK-NEXT: ldr s3, [x4, x10] +; CHECK-NEXT: fcvtzs w10, s3 +; CHECK-NEXT: str w10, [x2], #4 ; CHECK-NEXT: b.ne .LBB0_2 ; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup ; CHECK-NEXT: mov w0, wzr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 23f24a9dc9982..424388a30e99b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -624,7 +624,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -780,7 +779,6 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1212,7 +1210,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1366,7 +1363,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1822,14 +1818,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], 
v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1994,14 +1988,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 11024b0a88d6b..b52a39f1a55c8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -624,7 +624,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -780,7 +779,6 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1212,7 +1210,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1366,7 +1363,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1822,14 +1818,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1994,14 +1988,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index d62da6921b347..e1397e7331d3c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -2853,6 +2853,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: flat_store_b32 v[0:1], v3 @@ -3840,6 +3841,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll index 7c4069b4b3138..1e646eef51449 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f64.ll @@ -104,9 +104,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32 ; GFX90A_GFX940-NEXT: 
[[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) @@ -131,9 +131,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32> ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; 
GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -158,9 +158,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32> ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -187,9 +187,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x 
i32 ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -298,9 +298,9 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX90A_GFX940-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) @@ -325,9 +325,9 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr a ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 
%soffset, i32 0) @@ -352,9 +352,9 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr a ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -381,9 +381,9 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 - ; GFX90A_GFX940-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index e7ddfda2875db..1ef62fe127995 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -199,33 +199,31 @@ define amdgpu_kernel void @break_loop(i32 %arg) { ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_subrev_u32_e32 v0, s0, v0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 -; CHECK-NEXT: s_branch .LBB5_3 -; CHECK-NEXT: .LBB5_1: ; %bb4 -; CHECK-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; CHECK-NEXT: global_load_dword v2, v[0:1], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec -; CHECK-NEXT: v_cmp_ge_i32_e32 vcc, v0, v2 -; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc -; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; CHECK-NEXT: .LBB5_2: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB5_3 Depth=1 +; CHECK-NEXT: s_branch .LBB5_2 +; CHECK-NEXT: .LBB5_1: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; CHECK-NEXT: s_and_b64 s[4:5], exec, s[2:3] ; CHECK-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] -; 
CHECK-NEXT: s_cbranch_execz .LBB5_5 -; CHECK-NEXT: .LBB5_3: ; %bb1 +; CHECK-NEXT: s_cbranch_execz .LBB5_4 +; CHECK-NEXT: .LBB5_2: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_u32_e32 v1, 1, v1 ; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; CHECK-NEXT: s_and_b64 s[4:5], exec, -1 ; CHECK-NEXT: v_cmp_le_i32_e32 vcc, 0, v1 ; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; CHECK-NEXT: s_cbranch_vccz .LBB5_1 -; CHECK-NEXT: ; %bb.4: ; in Loop: Header=BB5_3 Depth=1 -; CHECK-NEXT: ; implicit-def: $vgpr1 -; CHECK-NEXT: s_branch .LBB5_2 -; CHECK-NEXT: .LBB5_5: ; %bb9 +; CHECK-NEXT: s_cbranch_vccnz .LBB5_1 +; CHECK-NEXT: ; %bb.3: ; %bb4 +; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1 +; CHECK-NEXT: global_load_dword v2, v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; CHECK-NEXT: v_cmp_ge_i32_e32 vcc, v0, v2 +; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc +; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; CHECK-NEXT: s_branch .LBB5_1 +; CHECK-NEXT: .LBB5_4: ; %bb9 ; CHECK-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll index 43f3dcc86f426..a948446aceff1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -28,9 +28,11 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, ptr addrspace(1) %ptr 
%elt = extractelement <64 x i32> %vec, i32 %idx @@ -60,9 +62,11 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: global_load_u16 v0, v[0:1], off ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %vec = load <128 x i16>, ptr addrspace(1) %ptr %elt = extractelement <128 x i16> %vec, i32 %idx @@ -92,9 +96,11 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %vec = load <32 x i64>, ptr addrspace(1) %ptr %elt = extractelement <32 x i64> %vec, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll index c9ab351f94016..a2c024a84b568 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll @@ -33,9 +33,9 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 ; 
GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir index 8cc1e608687fd..3f16aca150671 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir @@ -20,7 +20,7 @@ body: | ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]] ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]] ; GFX9-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIX_F32 9, [[COPY3]], 8, [[COPY3]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_FMA_MIX_F32_]], implicit $exec + ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_FMA_MIX_F32_]], implicit $exec ; GFX9-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %0:sgpr(s32) = COPY $sgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index 6459110dd8bbb..0f9a6407a4429 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -103,7 +103,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX908-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] ; GFX908-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] ; GFX908-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY17]], implicit $exec ; GFX908-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] @@ -175,7 +175,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] ; GFX90A-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] ; GFX90A-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY17]], implicit $exec ; GFX90A-NEXT: 
[[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] @@ -247,7 +247,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] ; GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] ; GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY17]], implicit $exec ; GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll index e935245e30f12..68cdfc23598e7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -97,7 +97,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX90A-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] ; GFX90A-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] ; GFX90A-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit 
$exec ; GFX90A-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] @@ -124,7 +124,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 ; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX90A-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec ; GFX90A-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY20]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec @@ -188,7 +188,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX940-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec ; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] ; GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] ; GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec ; GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] @@ -215,7 +215,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 ; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def 
$exec, implicit-def $scc, implicit $exec - ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX940-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec ; GFX940-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY20]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec @@ -274,11 +274,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 15 - ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_3]] + ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_3]] ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READLANE_B32_]], [[S_MOV_B32_4]], [[V_MOV_B32_dpp5]] ; GFX11-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 31 - ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_5]] + ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_5]] ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_1]] ; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY14]], implicit $exec ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] @@ -305,7 +305,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI 
[[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX11-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec ; GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY16]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll index 49c5dc7ed5a96..5400143a00639 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll @@ -85,9 +85,9 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; @@ -104,9 +104,9 @@ define amdgpu_ps double 
@global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic @@ -146,9 +146,9 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr0 = COPY 
[[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir index 54c1898a8a633..00d8abb0c2c32 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir @@ -17,7 +17,7 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_READFIRSTLANE_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index b26ddbdd7a342..5cba777959d8b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -625,7 +625,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ret <4 x float> %r } -define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void 
@image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 @@ -739,7 +739,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ret void } -define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 @@ -843,7 +843,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ret void } -define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) { ; GFX10-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 @@ -925,7 +925,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ret void } -define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) { ; GFX10-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll index 4a151aeca87e4..316d7fad91460 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll @@ -13,16 +13,16 @@ define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) { ; CHECK-NEXT: 
[[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_2]], implicit-def dead $scc ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 0, i32 1234, i32 5678) @@ -62,16 +62,16 @@ define amdgpu_ps ptr addrspace(8) @basic_struct_buffer(ptr inreg %p) { ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_MOV_B32_3]], implicit-def dead $scc ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] 
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 1234, i32 5678) @@ -92,16 +92,16 @@ define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i32 inreg %nu ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], 
implicit $exec ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 %numVals, i32 %flags) @@ -124,16 +124,16 @@ define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], [[S_MOV_B32_1]], implicit-def dead $scc ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_LSHL_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] - ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) @@ -192,10 +192,10 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i3 ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_AND_OR_B32_e64_]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_AND_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll index 570a39d0fa5fb..835fb468bfea4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll @@ -184,10 +184,10 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -196,7 +196,7 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -237,10 +237,10 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -249,7 +249,7 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, 
implicit-def $scc, implicit $exec @@ -296,10 +296,10 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -308,7 +308,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], 
implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -348,10 +348,10 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -360,7 +360,7 @@ define amdgpu_ps 
void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll index c2799e5836a97..4973129ed3370 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll @@ -110,10 +110,10 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX8-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -122,7 +122,7 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -167,10 
+167,10 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -179,7 +179,7 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec 
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -230,10 +230,10 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -242,7 +242,7 @@ define amdgpu_ps void 
@raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -285,10 +285,10 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; 
GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -297,7 +297,7 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -389,9 +389,9 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0 ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1 - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; @@ -417,9 +417,9 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1 ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1 - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -502,10 +502,10 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], 
implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 @@ -514,7 +514,7 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -536,9 +536,9 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 ; GFX8-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1 - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; @@ -568,10 +568,10 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 
= V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 @@ -580,7 +580,7 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -602,9 +602,9 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1 - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], 
implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -640,10 +640,10 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__ ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 @@ -652,7 +652,7 @@ define amdgpu_ps void 
@raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -699,10 +699,10 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__ ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; 
GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 @@ -711,7 +711,7 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -758,9 +758,9 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0 ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1 - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; @@ -786,9 +786,9 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1 ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1 - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %voffset = add i32 %voffset.base, 4095 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll index c96fc017ae936..b1846b8dbebc9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -159,10 +159,10 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; 
GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -171,7 +171,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[COPY6]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -211,10 +211,10 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -223,7 +223,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; 
GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -266,10 +266,10 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -278,7 +278,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -316,10 +316,10 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX90A-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -328,7 +328,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX90A-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll index 36d5e914d40be..1977712c56e36 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll @@ -215,10 +215,10 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -227,7 +227,7 @@ define amdgpu_ps 
half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -266,10 +266,10 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -278,7 +278,7 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -317,10 +317,10 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -329,7 +329,7 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = 
S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll index baeb5909f04e8..f0983501df293 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll @@ -186,13 +186,13 @@ define amdgpu_ps <4 x i32> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffse ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub3 - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX8-NEXT: $sgpr3 = COPY 
[[V_READFIRSTLANE_B32_3]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 ; @@ -212,13 +212,13 @@ define amdgpu_ps <4 x i32> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffse ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub2 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub3 - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 %val = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -245,10 +245,10 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; 
GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -257,7 +257,7 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], 
[[COPY5]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -296,10 +296,10 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -308,7 +308,7 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX12-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -398,13 +398,13 @@ define amdgpu_ps <4 x i32> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffse ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub3 - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; 
GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX8-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 ; @@ -424,13 +424,13 @@ define amdgpu_ps <4 x i32> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffse ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub2 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub3 - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 %voffset = add 
i32 %voffset.base, 4095 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll index 102a9bd840b09..8e167b9df5749 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll @@ -94,10 +94,10 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -141,10 +141,10 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -193,10 +193,10 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -205,7 +205,7 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -243,10 +243,10 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], 
implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -255,7 +255,7 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX12-NEXT: 
[[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -899,10 +899,10 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -946,10 +946,10 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -998,10 +998,10 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -1045,10 +1045,10 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -1492,10 +1492,10 @@ define amdgpu_ps float 
@raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -1541,10 +1541,10 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -1597,10 +1597,10 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -1644,10 +1644,10 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll index 6541085b72e54..7398e347e3397 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -238,10 +238,10 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -287,10 +287,10 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -336,10 +336,10 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -673,10 +673,10 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -725,10 +725,10 @@ define amdgpu_ps void 
@raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -774,10 +774,10 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit 
$exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll index 1f89150f09ced..28de527ba7f2a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -212,10 +212,10 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -263,10 +263,10 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: 
[[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -519,10 +519,10 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -570,10 +570,10 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index 030f8dae0ef79..8160ba4932055 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -97,10 +97,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -144,10 +144,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = 
COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -196,7 +196,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_64_xexec = COPY [[V_CMP_EQ_U32_e64_]] ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[COPY7]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -234,7 +234,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[V_CMP_EQ_U32_e64_]] ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[COPY7]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -277,10 +277,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit 
$exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -289,7 +289,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -327,10 +327,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -339,7 +339,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -917,10 +917,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY 
[[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -966,10 +966,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -1371,10 +1371,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -1418,10 +1418,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -1472,10 +1472,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -1518,10 +1518,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll index 93d68443c7843..d7844c52a51af 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll @@ -113,10 +113,10 @@ define amdgpu_ps float @raw_ptr_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; 
CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -125,7 +125,7 @@ define amdgpu_ps float @raw_ptr_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -172,10 +172,10 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc_ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -184,7 +184,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc_ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll index 56b2d0452dd45..3852a02cbf360 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll @@ -72,10 +72,10 @@ define amdgpu_ps float @raw_ptr_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__v ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], 
implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -84,7 +84,7 @@ define amdgpu_ps float @raw_ptr_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__v ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -135,10 +135,10 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_c ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; 
CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -147,7 +147,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_c ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -220,9 +220,9 @@ define amdgpu_ps double @raw_ptr_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call i64 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -284,10 +284,10 @@ define amdgpu_ps double @raw_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 @@ -296,7 +296,7 @@ define amdgpu_ps double @raw_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -318,9 +318,9 @@ define amdgpu_ps double @raw_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__ ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 ; CHECK-NEXT: 
[[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call i64 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -356,10 +356,10 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_c ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 @@ -368,7 +368,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_c ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -415,9 +415,9 @@ define amdgpu_ps double @raw_ptr_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %voffset = add i32 %voffset.base, 4095 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll index 798a3ee1d75fd..076c12e0d5bc7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll @@ -8,7 +8,7 @@ declare <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half>, ptr ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn: ; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc -define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 %soffset) { +define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) { main_body: %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) store float %ret, ptr undef @@ -17,7 +17,7 @@ main_body: ; GFX90A-LABEL: {{^}}buffer_atomic_add_v2f16_rtn: ; GFX90A: buffer_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc -define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) { main_body: %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) store 
<2 x half> %ret, ptr undef diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll index 999f42ff905ab..42c0749af5f6e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll @@ -159,10 +159,10 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -171,7 +171,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -211,10 +211,10 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX90A-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -223,7 +223,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -266,10 +266,10 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -278,7 +278,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX908-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -316,10 +316,10 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -328,7 +328,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 
[[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll index 5b19b1c913a94..cf059da089e50 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll @@ -166,10 +166,10 @@ define amdgpu_ps half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__ ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -178,7 +178,7 @@ define amdgpu_ps half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__ ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -217,10 +217,10 @@ define amdgpu_ps half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__ ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; 
UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -229,7 +229,7 @@ define amdgpu_ps half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__ ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; 
UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll index 2dc688db86e4f..d9c61674d2df5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll @@ -113,10 +113,10 @@ define amdgpu_ps float @raw_ptr_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset_ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY 
[[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -125,7 +125,7 @@ define amdgpu_ps float @raw_ptr_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset_ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll index 7b8b028128dd3..06259815a9223 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -62,10 +62,10 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -114,10 +114,10 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_s ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -126,7 +126,7 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_s ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -531,10 +531,10 @@ define amdgpu_ps half @raw_ptr_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_so ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -583,10 +583,10 @@ define amdgpu_ps float @raw_ptr_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_so ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -862,10 +862,10 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -918,10 +918,10 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s ; 
CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll index 3ed6bbdd36156..fa0af33281ed4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll @@ -176,10 +176,10 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 @@ -225,10 +225,10 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 @@ -487,10 +487,10 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY 
[[REG_SEQUENCE1]].sub0_sub1 ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 @@ -539,10 +539,10 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll index dee83a9b0a6ec..fb974a835164a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll @@ -128,10 +128,10 @@ define amdgpu_ps void 
@raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 @@ -299,10 +299,10 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; 
CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll index 2c99ce8694bcc..ec0bd1f9ca4ea 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -64,10 +64,10 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; 
CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -116,7 +116,7 @@ define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_64_xexec = COPY [[V_CMP_EQ_U32_e64_]] ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[COPY7]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -159,10 +159,10 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -171,7 +171,7 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -532,10 +532,10 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; 
CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 @@ -789,10 +789,10 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -843,10 +843,10 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, 
[[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll index a799e203d6439..cb4fd294730e1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll @@ -164,10 +164,10 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY 
[[REG_SEQUENCE]].sub2_sub3 @@ -176,7 +176,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -215,10 +215,10 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -227,7 +227,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll index 3e135472ebbb1..615543cf7ed51 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll @@ -112,10 +112,10 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -124,7 +124,7 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll index 725faa1b4a49f..99bc50eaf3a06 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll @@ -143,10 +143,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -190,10 +190,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -242,10 +242,10 @@ define amdgpu_ps void 
@raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -254,7 +254,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit 
$exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -292,10 +292,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -304,7 +304,7 
@@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -348,10 +348,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -360,7 +360,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -399,10 +399,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], 
implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -411,7 +411,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: 
[[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll index a12a6005df24e..cc70c27aa48f1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll @@ -56,10 +56,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = 
COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -103,10 +103,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -155,10 +155,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -167,7 +167,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; 
UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -205,10 +205,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -217,7 +217,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 
[[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -261,10 +261,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, 
[[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -273,7 +273,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -312,10 +312,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -324,7 +324,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll index 9db5c160a6236..5092060602bd4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll @@ -132,10 +132,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -184,10 +184,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; 
CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -196,7 +196,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: 
[[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -240,10 +240,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -252,7 +252,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = 
V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -602,10 +602,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -658,10 +658,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll index 1cfb15391be36..0850fdfd3f495 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll @@ -213,10 +213,10 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -225,7 +225,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], 
implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -264,10 +264,10 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = 
COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -276,7 +276,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -315,10 +315,10 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -327,7 +327,7 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll index 4f8b20d10c874..f6670baea089c 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll @@ -188,10 +188,10 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX10_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX10_GFX11-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX10_GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -200,7 +200,7 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX10_GFX11-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit 
$exec ; GFX10_GFX11-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX10_GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX10_GFX11-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX10_GFX11-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -239,10 +239,10 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -251,7 +251,7 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll index b9d0cb52d2405..cb622d250df3c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll @@ -191,10 +191,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -238,10 +238,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -285,10 +285,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: 
[[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -337,10 +337,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -349,7 +349,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -387,10 +387,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, 
[[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -399,7 +399,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -437,10 +437,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], 
implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -449,7 +449,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -493,10 +493,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: 
%bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -505,7 +505,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], 
implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -544,10 +544,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -556,7 +556,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -595,10 +595,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -607,7 +607,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll index c1fb4aacafe1d..1e61db7acff64 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll @@ -72,10 +72,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; 
UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -119,10 +119,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -166,10 +166,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -218,10 +218,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -230,7 +230,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 
[[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -268,10 +268,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -280,7 +280,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -318,10 +318,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; 
GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -330,7 +330,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -374,10 +374,10 @@ define amdgpu_ps void 
@raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -386,7 +386,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit 
$exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -425,10 +425,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -437,7 +437,7 
@@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -476,10 +476,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -488,7 +488,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll index 09227af922a6e..8d82772044794 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll @@ 
-218,10 +218,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX10_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX10_GFX11-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX10_GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -265,10 +265,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], 
implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -317,10 +317,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX10_GFX11-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX10_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX10_GFX11-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX10_GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -329,7 +329,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX10_GFX11-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX10_GFX11-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX10_GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX10_GFX11-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX10_GFX11-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -367,10 +367,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -379,7 +379,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = 
S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -423,10 +423,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX10_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX10_GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX10_GFX11-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -435,7 +435,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX10_GFX11-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX10_GFX11-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX10_GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX10_GFX11-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX10_GFX11-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -474,10 +474,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -486,7 +486,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1064,10 +1064,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY3]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX10_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX10_GFX11-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX10_GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -1113,10 +1113,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -1169,10 +1169,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX10_GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX10_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX10_GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX10_GFX11-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -1216,10 +1216,10 @@ define amdgpu_ps void 
@raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll index 9ef54ed724ec0..bf8e10143003a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll @@ -171,6 +171,7 @@ define float @v_rsq_clamp_undef_f32() #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_s_rsq_f32 s0, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0xff7fffff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_minmax_num_f32 
v0, s0, 0x7f7fffff, v0 ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll index 79b333c08cb2d..daa1923fb0d58 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -20,7 +20,7 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -36,7 +36,7 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -52,7 +52,7 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8-NEXT: 
[[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -68,7 +68,7 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -88,7 +88,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %so ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY 
[[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -104,7 +104,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %so ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -120,7 +120,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %so ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -136,7 +136,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %so ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[COPY4]], 0, 1 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 1) @@ -158,10 +158,10 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub0 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub1 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX6-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; @@ -179,10 +179,10 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub0 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub1 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX7-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; @@ -200,10 +200,10 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub0 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; @@ -221,10 +221,10 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR_IMM]].sub0 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR_IMM]].sub1 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -248,13 +248,13 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX6-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub2 ; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub3 ; GFX6-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX6-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX6-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX6-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 ; @@ -274,13 +274,13 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX7-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub2 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY 
[[S_BUFFER_LOAD_DWORDX4_SGPR]].sub3 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX7-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX7-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX7-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 ; @@ -300,13 +300,13 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub2 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub3 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY 
[[V_READFIRSTLANE_B32_1]] ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 ; @@ -325,13 +325,13 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX3_SGPR_IMM]].sub1 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX3_SGPR_IMM]].sub2 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -359,28 +359,28 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX6-NEXT: 
[[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub6 ; GFX6-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub7 ; GFX6-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; GFX6-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX6-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec ; GFX6-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX6-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec ; GFX6-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; GFX6-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec ; GFX6-NEXT: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] ; GFX6-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; GFX6-NEXT: 
[[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec ; GFX6-NEXT: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] ; GFX6-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec ; GFX6-NEXT: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] ; GFX6-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec ; GFX6-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 ; @@ -404,28 +404,28 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX7-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub6 ; GFX7-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub7 ; GFX7-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; GFX7-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX7-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec ; GFX7-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX7-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec ; GFX7-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; GFX7-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec ; GFX7-NEXT: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] ; GFX7-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec ; GFX7-NEXT: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] ; GFX7-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec ; GFX7-NEXT: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] ; GFX7-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec ; GFX7-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit 
$sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 ; @@ -449,28 +449,28 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX8-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub6 ; GFX8-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub7 ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX8-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec ; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX8-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec ; GFX8-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; GFX8-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec ; GFX8-NEXT: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] ; GFX8-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = 
COPY [[COPY10]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec ; GFX8-NEXT: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] ; GFX8-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec ; GFX8-NEXT: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] ; GFX8-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec ; GFX8-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 ; @@ -494,28 +494,28 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub6 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR_IMM]].sub7 ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; 
GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec ; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec ; GFX12-NEXT: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec ; GFX12-NEXT: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec ; GFX12-NEXT: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec ; GFX12-NEXT: 
$sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -551,52 +551,52 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr ; GFX6-NEXT: [[COPY19:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub14 ; GFX6-NEXT: [[COPY20:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub15 ; GFX6-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec ; GFX6-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX6-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec ; GFX6-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX6-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec ; GFX6-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; GFX6-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec ; GFX6-NEXT: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] ; GFX6-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec ; GFX6-NEXT: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] ; GFX6-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec ; GFX6-NEXT: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] ; GFX6-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec ; GFX6-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX6-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec ; GFX6-NEXT: $sgpr8 = COPY [[V_READFIRSTLANE_B32_8]] ; GFX6-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY30]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY30]], implicit $exec ; GFX6-NEXT: $sgpr9 = COPY [[V_READFIRSTLANE_B32_9]] ; GFX6-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY15]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], 
implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec ; GFX6-NEXT: $sgpr10 = COPY [[V_READFIRSTLANE_B32_10]] ; GFX6-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[COPY16]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec ; GFX6-NEXT: $sgpr11 = COPY [[V_READFIRSTLANE_B32_11]] ; GFX6-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec ; GFX6-NEXT: $sgpr12 = COPY [[V_READFIRSTLANE_B32_12]] ; GFX6-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[COPY18]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec ; GFX6-NEXT: $sgpr13 = COPY [[V_READFIRSTLANE_B32_13]] ; GFX6-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[COPY19]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec ; GFX6-NEXT: $sgpr14 = COPY [[V_READFIRSTLANE_B32_14]] ; GFX6-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[COPY20]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec ; GFX6-NEXT: $sgpr15 = COPY [[V_READFIRSTLANE_B32_15]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit 
$sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 ; @@ -628,52 +628,52 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr ; GFX7-NEXT: [[COPY19:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub14 ; GFX7-NEXT: [[COPY20:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub15 ; GFX7-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec ; GFX7-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX7-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec ; GFX7-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX7-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec ; GFX7-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; GFX7-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec ; 
GFX7-NEXT: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] ; GFX7-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec ; GFX7-NEXT: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] ; GFX7-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec ; GFX7-NEXT: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] ; GFX7-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec ; GFX7-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX7-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec ; GFX7-NEXT: $sgpr8 = COPY [[V_READFIRSTLANE_B32_8]] ; GFX7-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY30]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY30]], implicit $exec ; GFX7-NEXT: $sgpr9 = COPY [[V_READFIRSTLANE_B32_9]] ; GFX7-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY15]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec ; GFX7-NEXT: $sgpr10 = COPY 
[[V_READFIRSTLANE_B32_10]] ; GFX7-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[COPY16]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec ; GFX7-NEXT: $sgpr11 = COPY [[V_READFIRSTLANE_B32_11]] ; GFX7-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec ; GFX7-NEXT: $sgpr12 = COPY [[V_READFIRSTLANE_B32_12]] ; GFX7-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[COPY18]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec ; GFX7-NEXT: $sgpr13 = COPY [[V_READFIRSTLANE_B32_13]] ; GFX7-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[COPY19]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec ; GFX7-NEXT: $sgpr14 = COPY [[V_READFIRSTLANE_B32_14]] ; GFX7-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[COPY20]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec ; GFX7-NEXT: $sgpr15 = COPY [[V_READFIRSTLANE_B32_15]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 ; @@ 
-705,52 +705,52 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr ; GFX8-NEXT: [[COPY19:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub14 ; GFX8-NEXT: [[COPY20:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub15 ; GFX8-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX8-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec ; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX8-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec ; GFX8-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; GFX8-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec ; GFX8-NEXT: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] ; GFX8-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; GFX8-NEXT: 
[[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec ; GFX8-NEXT: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] ; GFX8-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec ; GFX8-NEXT: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] ; GFX8-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec ; GFX8-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX8-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec ; GFX8-NEXT: $sgpr8 = COPY [[V_READFIRSTLANE_B32_8]] ; GFX8-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY30]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY30]], implicit $exec ; GFX8-NEXT: $sgpr9 = COPY [[V_READFIRSTLANE_B32_9]] ; GFX8-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY15]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec ; GFX8-NEXT: $sgpr10 = COPY [[V_READFIRSTLANE_B32_10]] ; GFX8-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[COPY16]] - ; GFX8-NEXT: 
[[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec ; GFX8-NEXT: $sgpr11 = COPY [[V_READFIRSTLANE_B32_11]] ; GFX8-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec ; GFX8-NEXT: $sgpr12 = COPY [[V_READFIRSTLANE_B32_12]] ; GFX8-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[COPY18]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec ; GFX8-NEXT: $sgpr13 = COPY [[V_READFIRSTLANE_B32_13]] ; GFX8-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[COPY19]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec ; GFX8-NEXT: $sgpr14 = COPY [[V_READFIRSTLANE_B32_14]] ; GFX8-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[COPY20]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec ; GFX8-NEXT: $sgpr15 = COPY [[V_READFIRSTLANE_B32_15]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 ; @@ -782,52 +782,52 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr ; 
GFX12-NEXT: [[COPY19:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub14 ; GFX12-NEXT: [[COPY20:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR_IMM]].sub15 ; GFX12-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX12-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX12-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec ; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; GFX12-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec ; GFX12-NEXT: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] ; GFX12-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec + ; 
GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec ; GFX12-NEXT: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] ; GFX12-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec ; GFX12-NEXT: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] ; GFX12-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec ; GFX12-NEXT: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX12-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec ; GFX12-NEXT: $sgpr8 = COPY [[V_READFIRSTLANE_B32_8]] ; GFX12-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY30]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY30]], implicit $exec ; GFX12-NEXT: $sgpr9 = COPY [[V_READFIRSTLANE_B32_9]] ; GFX12-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY15]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec ; GFX12-NEXT: $sgpr10 = COPY [[V_READFIRSTLANE_B32_10]] ; GFX12-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[COPY16]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec + ; 
GFX12-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec ; GFX12-NEXT: $sgpr11 = COPY [[V_READFIRSTLANE_B32_11]] ; GFX12-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec ; GFX12-NEXT: $sgpr12 = COPY [[V_READFIRSTLANE_B32_12]] ; GFX12-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[COPY18]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec ; GFX12-NEXT: $sgpr13 = COPY [[V_READFIRSTLANE_B32_13]] ; GFX12-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[COPY19]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec ; GFX12-NEXT: $sgpr14 = COPY [[V_READFIRSTLANE_B32_14]] ; GFX12-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[COPY20]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec ; GFX12-NEXT: $sgpr15 = COPY [[V_READFIRSTLANE_B32_15]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -847,7 +847,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1(<4 x i32> inreg 
%rsrc) { ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -863,7 +863,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1(<4 x i32> inreg %rsrc) { ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -878,7 +878,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -893,7 +893,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1(<4 x i32> inreg %rsrc) { ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1, i32 0) @@ -912,7 +912,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_4(<4 x i32> inreg %rsrc) { ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 1 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -927,7 +927,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_4(<4 x i32> inreg %rsrc) { ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 1 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -942,7 +942,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_4(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 4, 1 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -957,7 +957,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_4(<4 x i32> inreg %rsrc) { ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 4, 1 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 
= V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 4, i32 1) @@ -977,7 +977,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_255(<4 x i32> inreg %rsrc) { ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -993,7 +993,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_255(<4 x i32> inreg %rsrc) { ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1008,7 +1008,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_255(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], 
%subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1023,7 +1023,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_255(<4 x i32> inreg %rsrc) { ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 255, i32 0) @@ -1042,7 +1042,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_256(<4 x i32> inreg %rsrc) { ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 64, 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX6-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1057,7 +1057,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_256(<4 x i32> inreg %rsrc) { ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 64, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1072,7 +1072,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_256(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 256, 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1087,7 +1087,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_256(<4 x i32> 
inreg %rsrc) { ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 256, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 256, i32 0) @@ -1106,7 +1106,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1020(<4 x i32> inreg %rsrc) { ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1121,7 +1121,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1020(<4 x i32> inreg %rsrc) { ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0 :: 
(dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1136,7 +1136,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1020(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1020, 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1151,7 +1151,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1020(<4 x i32> inreg %rsrc) { ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1020, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY 
[[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1020, i32 0) @@ -1171,7 +1171,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1023(<4 x i32> inreg %rsrc) { ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1187,7 +1187,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1023(<4 x i32> inreg %rsrc) { ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1202,7 +1202,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1023(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1023, 0 :: (dereferenceable 
invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1217,7 +1217,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1023(<4 x i32> inreg %rsrc) { ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1023, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1023, i32 0) @@ -1237,7 +1237,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1024(<4 x i32> inreg %rsrc) { ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: 
SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1252,7 +1252,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1024(<4 x i32> inreg %rsrc) { ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 256, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1267,7 +1267,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1024(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1024, 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1282,7 +1282,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1024(<4 x i32> inreg %rsrc) { ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = 
S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1024, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1024, i32 0) @@ -1302,7 +1302,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1025(<4 x i32> inreg %rsrc) { ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1025 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1318,7 +1318,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1025(<4 x i32> inreg %rsrc) { ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1025 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG 
implicit $sgpr0 ; @@ -1333,7 +1333,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1025(<4 x i32> inreg %rsrc) { ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1025, 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1348,7 +1348,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1025(<4 x i32> inreg %rsrc) { ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1025, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1025, i32 0) @@ -1368,7 +1368,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) { ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], 
[[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1384,7 +1384,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) { ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1400,7 +1400,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1416,7 +1416,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) { ; 
GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1, i32 0) @@ -1436,7 +1436,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) { ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1451,7 +1451,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) { ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073741823, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] - ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1467,7 +1467,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1483,7 +1483,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) { ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -4, i32 0) @@ -1503,7 +1503,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) { ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8 
; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1518,7 +1518,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) { ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073741822, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1534,7 +1534,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit 
$exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1550,7 +1550,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) { ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -8, i32 0) @@ -1570,7 +1570,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) { ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1585,7 +1585,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) { ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = 
S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 536870912, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1601,7 +1601,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1617,7 +1617,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) { ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 
@llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -2147483648, i32 0) @@ -1637,7 +1637,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_bit30(<4 x i32> inreg %desc) ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741824 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 1 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1652,7 +1652,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_bit30(<4 x i32> inreg %desc) ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 268435456, 1 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1668,7 +1668,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_bit30(<4 x i32> inreg %desc) ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741824 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 1 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1684,7 +1684,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_bit30(<4 x i32> inreg %desc) ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741824 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 1 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1073741824, i32 1) @@ -1704,7 +1704,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit29(<4 x i32> inreg %desc) { ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 536870912 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1719,7 +1719,7 @@ define amdgpu_ps i32 
@s_buffer_load_i32_offset_bit29(<4 x i32> inreg %desc) { ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 134217728, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1735,7 +1735,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit29(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 536870912 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1751,7 +1751,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit29(<4 x i32> inreg %desc) { ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 536870912 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 
= V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 536870912, i32 0) @@ -1771,7 +1771,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit21(<4 x i32> inreg %desc) { ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2097152 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1786,7 +1786,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit21(<4 x i32> inreg %desc) { ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 524288, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1802,7 +1802,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit21(<4 x i32> inreg %desc) { ; 
GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2097152 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1817,7 +1817,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit21(<4 x i32> inreg %desc) { ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 2097152, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 2097152, i32 0) @@ -1837,7 +1837,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit20(<4 x i32> inreg %desc) { ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1852,7 +1852,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit20(<4 x i32> inreg %desc) { ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 262144, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1868,7 +1868,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit20(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1883,7 +1883,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit20(<4 x i32> inreg %desc) { ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, 
[[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1048576, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1048576, i32 0) @@ -1903,7 +1903,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc) ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1048576 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1918,7 +1918,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc) ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073479680, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] - ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1934,7 +1934,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc) ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1048576 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1950,7 +1950,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc) ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1048576 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1048576, i32 0) @@ -1970,7 +1970,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit19(<4 x i32> inreg %desc) { ; GFX6-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 524288 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -1985,7 +1985,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit19(<4 x i32> inreg %desc) { ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 131072, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -2000,7 +2000,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit19(<4 x i32> inreg %desc) { ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 524288, 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -2015,7 +2015,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit19(<4 x i32> inreg %desc) { ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 524288, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 524288, i32 0) @@ -2035,7 +2035,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc) ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -524288 ; GFX6-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX6-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -2050,7 +2050,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc) ; 
GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073610752, 0 :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]] - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX7-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -2066,7 +2066,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc) ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -524288 ; GFX8-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0 :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 ; @@ -2082,7 +2082,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc) ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -524288 ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR_IMM]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; 
GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -524288, i32 0) @@ -3625,10 +3625,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -3673,10 +3673,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; 
GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -3721,10 +3721,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -3768,10 +3768,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -3819,10 +3819,10 @@ define amdgpu_ps float 
@s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -3865,10 +3865,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -3911,10 +3911,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], 
%subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -3957,10 +3957,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4013,10 +4013,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY1]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4063,10 +4063,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4113,10 +4113,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4159,10 +4159,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX12-NEXT: bb.2: 
; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4211,10 +4211,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], 
implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4257,10 +4257,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: 
[[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4303,10 +4303,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4348,10 +4348,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4401,10 +4401,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 
= V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4449,10 +4449,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4495,10 +4495,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; 
GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4540,10 +4540,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4592,10 +4592,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY 
[[REG_SEQUENCE]].sub0_sub1 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4655,10 +4655,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4718,10 +4718,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], 
implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4781,10 +4781,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4855,10 +4855,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4922,10 +4922,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -4989,10 +4989,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5052,10 +5052,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: 
[[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5124,10 +5124,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5191,10 +5191,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5258,10 +5258,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5321,10 +5321,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5390,10 +5390,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5454,10 +5454,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5518,10 +5518,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5581,10 
+5581,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5650,10 +5650,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY3]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5714,10 +5714,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], 
%subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5778,10 +5778,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5841,10 +5841,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5910,10 +5910,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -5974,10 +5974,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -6038,10 +6038,10 @@ define amdgpu_ps <8 x float> 
@s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -6101,10 +6101,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -6169,10 +6169,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX6-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -6232,10 +6232,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -6295,10 +6295,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -6357,10 +6357,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit 
$exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll index ab720ce8f942c..abee7de151fd4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll @@ -202,10 +202,10 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], 
%subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -214,7 +214,7 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -258,10 +258,10 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; 
GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -270,7 +270,7 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -320,10 +320,10 @@ define amdgpu_ps void 
@struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -332,7 +332,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -375,10 +375,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -387,7 +387,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; GFX12-NEXT: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll index f9f70ecadfe60..1c00ffbf21abb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll @@ -119,10 +119,10 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec 
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -131,7 +131,7 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -179,10 +179,10 @@ define amdgpu_ps float 
@struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -191,7 +191,7 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -245,10 +245,10 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -257,7 +257,7 @@ define amdgpu_ps void 
@struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -303,10 +303,10 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; 
GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -315,7 +315,7 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -414,9 +414,9 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__s ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0 ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1 - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; @@ -444,9 +444,9 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__s ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0 ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1 - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -535,10 +535,10 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; GFX8-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 @@ -547,7 +547,7 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead 
$scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -570,9 +570,9 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 ; GFX8-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1 - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; @@ -604,10 +604,10 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 @@ -616,7 +616,7 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -639,9 +639,9 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 ; GFX12-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1 - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -679,10 +679,10 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; GFX8-NEXT: 
[[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 @@ -691,7 +691,7 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -741,10 +741,10 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; 
GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 @@ -753,7 +753,7 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -803,9 +803,9 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__s ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0 ; GFX8-NEXT: 
[[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1 - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; @@ -833,9 +833,9 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__s ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0 ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1 - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %voffset = add i32 %voffset.base, 4095 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll index 8589fe9fd056d..c0027642655a6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -173,10 +173,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -185,7 +185,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -228,10 +228,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY 
[[REG_SEQUENCE]].sub0_sub1 ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -240,7 +240,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -286,10 +286,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -298,7 +298,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -338,10 +338,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} - ; 
GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -350,7 +350,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX90A-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll index 870588014cd29..98a2780e03b81 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll @@ -234,10 +234,10 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -246,7 +246,7 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -309,10 +309,10 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], 
implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -321,7 +321,7 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -366,10 +366,10 @@ define amdgpu_ps <4 x half> 
@struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -378,7 +378,7 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll index 06bd45a45cced..c2ab42b08f477 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll @@ -207,10 +207,10 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -219,7 +219,7 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -268,10 +268,10 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], 
implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -280,7 +280,7 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll index 94ce8aac8a4c6..588b0204619f0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll @@ -329,10 +329,10 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -341,7 +341,7 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX8-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -383,10 +383,10 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], 
%subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -395,7 +395,7 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll index f62a15d470afd..692381008e6fb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll @@ -212,10 +212,10 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} 
- ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -224,7 +224,7 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; UNPACKED-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -267,10 +267,10 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -279,7 +279,7 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; PACKED-NEXT: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -322,10 +322,10 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -334,7 +334,7 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll index 8a395f0e73222..8a104e1fbfc83 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll @@ -195,10 +195,10 @@ define amdgpu_ps void 
@struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -207,7 +207,7 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -250,10 +250,10 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -262,7 +262,7 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; GFX12-NEXT: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll index b89ed46ba0550..210c3bb50cb15 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll @@ -200,10 +200,10 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 ; GFX8-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 @@ -212,7 +212,7 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -259,10 +259,10 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; 
GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 @@ -271,7 +271,7 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[COPY10]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll index 2e7323068d108..cc937f497d2ca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll @@ -123,10 +123,10 @@ define amdgpu_ps float @struct_ptr_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__s ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -135,7 +135,7 @@ define amdgpu_ps float @struct_ptr_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__s ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -185,10 +185,10 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rs ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -197,7 +197,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rs ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll index e8e6cab4edbe8..fb67ddaa2fb40 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll @@ -78,10 +78,10 @@ define amdgpu_ps float @struct_ptr_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -90,7 +90,7 @@ define amdgpu_ps float @struct_ptr_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; CHECK-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -144,10 +144,10 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgp ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -156,7 +156,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgp ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -234,9 +234,9 @@ define amdgpu_ps double @struct_ptr_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cm ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; CHECK-NEXT: $sgpr0 = COPY 
[[V_READFIRSTLANE_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call i64 @llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -302,10 +302,10 @@ define amdgpu_ps double @struct_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cm ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 @@ -314,7 +314,7 @@ define amdgpu_ps double 
@struct_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cm ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -337,9 +337,9 @@ define amdgpu_ps double @struct_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cm ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call i64 @llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.i64(i64 %val, i64 
%cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -377,10 +377,10 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgp ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 @@ -389,7 +389,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgp ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -439,9 +439,9 @@ define amdgpu_ps double @struct_ptr_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cm ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %voffset = add i32 %voffset.base, 4095 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll index 54657982493f7..18568aaa02634 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll @@ -173,10 
+173,10 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -185,7 +185,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -228,10 +228,10 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -240,7 
+240,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -286,10 +286,10 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -298,7 +298,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -338,10 +338,10 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -350,7 +350,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], 
[[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll index 6c0319ef570d6..bc4bd34985372 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll @@ -179,10 +179,10 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY 
[[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -191,7 +191,7 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -254,10 +254,10 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; PACKED-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -266,7 +266,7 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll index 
1e3f94a5e39cb..caaa76569ec89 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll @@ -123,10 +123,10 @@ define amdgpu_ps <4 x float> @struct_ptr_buffer_load_format_v4f32__vpr_rsrc__sgp ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -135,7 +135,7 @@ define amdgpu_ps <4 x float> @struct_ptr_buffer_load_format_v4f32__vpr_rsrc__sgp ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; CHECK-NEXT: 
[[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll index 66c62e9ce8a9c..95789b5ac49ed 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll @@ -193,10 +193,10 @@ define amdgpu_ps float @struct_ptr_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -205,7 +205,7 @@ define amdgpu_ps float @struct_ptr_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll index 25fe7d2877ce3..fe2b04841e332 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll 
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll @@ -158,10 +158,10 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f16__sgpr_val__vgpr_rsrc__ ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -170,7 +170,7 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f16__sgpr_val__vgpr_rsrc__ ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -213,10 +213,10 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f16__sgpr_val__vgpr_rsrc__ ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: 
[[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -225,7 +225,7 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f16__sgpr_val__vgpr_rsrc__ ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll index 3a4c258537814..3c5c337c6912b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll @@ -117,10 +117,10 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f32__sgpr_val__vgpr_rsrc__ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -129,7 +129,7 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f32__sgpr_val__vgpr_rsrc__ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 
[[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll index 2e0a12b9d969c..a18d0c2165275 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll @@ -122,10 +122,10 @@ define amdgpu_ps void @struct_ptr_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_v ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY 
[[REG_SEQUENCE1]].sub2_sub3 @@ -134,7 +134,7 @@ define amdgpu_ps void @struct_ptr_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_v ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll index 1a57c2e77bddf..cae94487625e3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll @@ -220,10 +220,10 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -232,7 +232,7 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed 
[[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -277,10 +277,10 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -289,7 +289,7 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], 
implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll index 63143ed718054..b08b46f20fc9e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll @@ -145,10 +145,10 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -157,7 +157,7 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll index f270f87aae66d..87c1e7b471271 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll @@ -293,10 +293,10 @@ define amdgpu_ps <4 x half> 
@struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; PACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -305,7 +305,7 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; PACKED-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -350,10 +350,10 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -362,7 +362,7 @@ define amdgpu_ps <4 x half> 
@struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; GFX12-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; GFX12-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GFX12-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -407,10 +407,10 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[COPY2]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -419,7 +419,7 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll index 7d3ecd363befb..23468c29ff79a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll @@ 
-247,10 +247,10 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 @@ -259,7 +259,7 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; 
CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -308,10 +308,10 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-GFX12-NEXT: bb.2: ; CHECK-GFX12-NEXT: successors: %bb.3(0x80000000) ; CHECK-GFX12-NEXT: {{ $}} - ; CHECK-GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec - ; CHECK-GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec - ; CHECK-GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec - ; CHECK-GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; CHECK-GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec ; CHECK-GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK-GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; CHECK-GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = 
COPY [[REG_SEQUENCE]].sub2_sub3 @@ -320,7 +320,7 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec ; CHECK-GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec ; CHECK-GFX12-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK-GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK-GFX12-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-GFX12-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir index 4d611c15c868f..72f7ac3f0bf38 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir @@ -20,7 +20,7 @@ body: | ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]] ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]] ; GFX9-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY3]], 8, [[COPY3]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MAD_MIX_F32_]], implicit $exec + ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[V_MAD_MIX_F32_]], implicit $exec ; GFX9-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 %0:sgpr(s32) = COPY $sgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll index 2c71366772fc9..31526bcfead4e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -832,6 +832,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(ptr addrspace(1) %p ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967296 %val = load volatile float, ptr addrspace(1) %gep @@ -865,6 +866,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(ptr addrspace(1) %p ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967297 %val = load volatile float, ptr addrspace(1) %gep @@ -979,6 +981,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(ptr addrspace(1) %ptr, i ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset %val = load volatile float, ptr addrspace(1) %gep @@ -1017,6 +1020,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(ptr addrspace( ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: 
global_load_b32 v0, v[0:1], off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 256 @@ -1056,6 +1060,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(ptr addrspace( ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 256 %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 %soffset @@ -1099,6 +1104,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(ptr addrspace(1) inreg % ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr float, ptr addrspace(1) %ptr, i32 %voffset %val = load volatile float, ptr addrspace(1) %gep @@ -1143,6 +1149,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset_offset4095(ptr addrspace ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr float, ptr addrspace(1) %ptr, i32 %voffset %gep1 = getelementptr float, ptr addrspace(1) %gep0, i64 4095 @@ -1187,6 +1194,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(ptr addrspace ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_alu 
0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 4095 %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 %voffset @@ -1359,6 +1367,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace( ; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst @@ -1411,6 +1420,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in ; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst @@ -1590,6 +1600,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1) ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -1643,6 +1654,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_alu 0xfffd ; 
GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 7c6daf769aec2..ba2af13338be6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -1072,14 +1072,17 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v2, v11 ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6] +; GFX12-NEXT: s_wait_alu 0xf1fd ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i128 %num, %den ret i128 %result @@ -2433,71 +2436,83 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 ; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] +; GFX12-NEXT: s_wait_alu 0xf1ff ; 
GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v20, v22 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) 
; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v19, v22 ; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 ; GFX12-NEXT: v_mov_b32_e32 v20, v18 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] ; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 ; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 ; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 ; GFX12-NEXT: v_mov_b32_e32 v13, v1 ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 ; GFX12-NEXT: v_mov_b32_e32 v14, v21 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 ; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4 ; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4 @@ -2505,11 +2520,13 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3 ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] +; GFX12-NEXT: s_wait_alu 0xf1fd ; 
GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i256 %num, %den ret i256 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir index 3d016c143e706..bf155eefe6129 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir @@ -69,10 +69,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.2 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; CHECK-NEXT: 
[[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) @@ -125,10 +125,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %8, %bb.2 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.append.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.append.mir index 70ca545024843..f050616d4e626 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.append.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.append.mir @@ -30,7 +30,7 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(p3) = COPY $vgpr0 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(p3) = V_READFIRSTLANE_B32 [[COPY]](p3), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(p3) = V_READFIRSTLANE_B32 [[COPY]](p3), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.append), [[V_READFIRSTLANE_B32_]](p3), 0 %0:_(p3) = COPY $vgpr0 %1:_(s32) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.append), %0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.consume.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.consume.mir index 14cadc90db749..071fdc8897a00 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.consume.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.consume.mir @@ -30,7 +30,7 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(p3) = COPY $vgpr0 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(p3) = V_READFIRSTLANE_B32 [[COPY]](p3), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(p3) = V_READFIRSTLANE_B32 [[COPY]](p3), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.consume), [[V_READFIRSTLANE_B32_]](p3), 0 %0:_(p3) = COPY $vgpr0 %1:_(s32) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.consume), %0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.init.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.init.mir index facf7717ce5a0..388edcb16a7a7 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.init.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.init.mir @@ -36,7 +36,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.init), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 @@ -75,7 +75,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.init), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.sema.v.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.sema.v.mir index ad575fe813755..7fdba6490f3ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.sema.v.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.sema.v.mir @@ -31,9 +31,8 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.sema.v), [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $vgpr0 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.sema.v), %0 ... - diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.ordered.add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.ordered.add.mir index 516e842d1bf11..655155dc99087 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.ordered.add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.ordered.add.mir @@ -34,7 +34,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.ordered.add), [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), 0, 0, 0, 0, 0, 0 %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $sgpr0 @@ -53,7 +53,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.ordered.add), [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), 0, 0, 0, 0, 0, 0 %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.ordered.swap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.ordered.swap.mir index c8cd2f850c1ad..40a1183098245 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.ordered.swap.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.ordered.swap.mir @@ -34,7 +34,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.ordered.swap), [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), 0, 0, 0, 0, 0, 0 %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $sgpr0 @@ -53,7 +53,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.ordered.swap), [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), 0, 0, 0, 0, 0, 0 %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll index c561bd85d070a..774b9c3972f9d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll @@ -122,14 +122,14 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.3 ; FAST-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) - ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; FAST-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec ; FAST-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; FAST-NEXT: [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) ; FAST-NEXT: [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<8 x s32>) @@ -184,14 +184,14 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GREEDY-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; GREEDY-NEXT: [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 
x s32>) ; GREEDY-NEXT: [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<8 x s32>) @@ -253,14 +253,14 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %18, %bb.3 ; FAST-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) - ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = 
V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec ; FAST-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; FAST-NEXT: [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) ; FAST-NEXT: [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<8 x s32>) @@ -316,14 +316,14 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %18, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = 
V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec ; GREEDY-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; GREEDY-NEXT: [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), 
[[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) ; GREEDY-NEXT: [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<8 x s32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll index bd3d5f29f11f9..3c4e8d411d9be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll @@ -143,14 +143,14 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3 ; FAST-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) - ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 
[[UV7]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec ; FAST-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; FAST-NEXT: [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) ; FAST-NEXT: [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<8 x s32>) @@ -209,14 +209,14 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), 
[[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit 
$exec ; GREEDY-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; GREEDY-NEXT: [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) ; GREEDY-NEXT: [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<8 x s32>) @@ -281,10 +281,10 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3 ; FAST-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) - ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; FAST-NEXT: 
[[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; FAST-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) ; FAST-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<4 x s32>) @@ -339,10 +339,10 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GREEDY-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = 
G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) ; GREEDY-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<4 x s32>) @@ -403,14 +403,14 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3 ; FAST-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) - ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 
[[UV3]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec ; FAST-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; FAST-NEXT: [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) ; FAST-NEXT: [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<8 x s32>) @@ -422,10 +422,10 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; FAST-NEXT: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV15]](s64), [[UV11]] ; FAST-NEXT: [[AND2:%[0-9]+]]:vcc(s1) = G_AND [[AND1]], [[ICMP3]] ; FAST-NEXT: [[UV16:%[0-9]+]]:vgpr_32(s32), [[UV17:%[0-9]+]]:vgpr_32(s32), [[UV18:%[0-9]+]]:vgpr_32(s32), [[UV19:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) - ; FAST-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV16]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV17]](s32), implicit $exec - ; FAST-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV18]](s32), implicit $exec - ; FAST-NEXT: 
[[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV19]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV16]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV17]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV18]](s32), implicit $exec + ; FAST-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV19]](s32), implicit $exec ; FAST-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) ; FAST-NEXT: [[UV20:%[0-9]+]]:vgpr(s64), [[UV21:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) ; FAST-NEXT: [[UV22:%[0-9]+]]:sgpr(s64), [[UV23:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR3]](<4 x s32>) @@ -481,14 +481,14 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %22, %bb.3 ; GREEDY-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec - ; GREEDY-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec ; GREEDY-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; GREEDY-NEXT: [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) ; GREEDY-NEXT: [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), 
[[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<8 x s32>) @@ -500,10 +500,10 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; GREEDY-NEXT: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV15]](s64), [[UV11]] ; GREEDY-NEXT: [[AND2:%[0-9]+]]:vcc(s1) = G_AND [[AND1]], [[ICMP3]] ; GREEDY-NEXT: [[UV16:%[0-9]+]]:vgpr_32(s32), [[UV17:%[0-9]+]]:vgpr_32(s32), [[UV18:%[0-9]+]]:vgpr_32(s32), [[UV19:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV16]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV17]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV18]](s32), implicit $exec - ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV19]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV16]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV17]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV18]](s32), implicit $exec + ; GREEDY-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV19]](s32), implicit $exec ; GREEDY-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) ; GREEDY-NEXT: [[UV20:%[0-9]+]]:vgpr(s64), [[UV21:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) ; GREEDY-NEXT: [[UV22:%[0-9]+]]:sgpr(s64), [[UV23:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR3]](<4 x s32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.mov.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.mov.mir index 299e8fc389984..fed0799a6c784 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.mov.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.mov.mir @@ -34,7 +34,7 @@ body: | ; CHECK: liveins: $sgpr0, $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.interp.mov), 0, 1, 1, [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.interp.mov), 0, 1, 1, %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p1.f16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p1.f16.mir index b1fc3181c0afc..1dac8d2ae3482 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p1.f16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p1.f16.mir @@ -37,7 +37,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.interp.p1.f16), [[COPY2]](s32), 1, 1, 1, [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p1.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p1.mir index 6567c159a745f..150fe4e2cd062 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p1.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p1.mir @@ -55,7 +55,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.interp.p1), [[COPY2]](s32), 1, 1, [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 @@ -75,7 +75,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.interp.p1), [[COPY]](s32), 1, 1, [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p2.f16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p2.f16.mir index 27913ec3e85ec..81ee9ea80a0a9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p2.f16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p2.f16.mir @@ -42,7 +42,7 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; 
CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.interp.p2.f16), [[COPY3]](s32), [[COPY4]](s32), 1, 1, 1, [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p2.mir index 353bd7207a162..31aa855bae408 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p2.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.interp.p2.mir @@ -41,7 +41,7 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.interp.p2), [[COPY3]](s32), [[COPY4]](s32), 1, 1, [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.direct.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.direct.load.mir index f39d13f8b4868..3936abfcead29 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.direct.load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.direct.load.mir @@ -31,7 +31,7 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.lds.direct.load), [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.lds.direct.load), %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.param.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.param.load.mir index 9b5c79f355da7..f90f589ed0f58 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.param.load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.param.load.mir @@ -31,7 +31,7 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.lds.param.load), 1, 1, [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.lds.param.load), 1, 1, %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll index b9bde7aecd34b..3df5a16fa74d8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -71,10 +71,10 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: 
[[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -126,7 +126,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY5]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec @@ -174,17 +174,17 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), 
[[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] ; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll index 88fcc593dfaf3..840b1e8f914ef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll @@ -71,10 +71,10 @@ define amdgpu_ps float @raw_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 
[[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -126,7 +126,7 @@ define amdgpu_ps float @raw_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY5]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), 
implicit-def $exec, implicit-def $scc, implicit $exec @@ -174,17 +174,17 @@ define amdgpu_ps float @raw_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %20, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] ; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND 
[[ICMP]], [[ICMP1]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir index c02b8ddb3cdea..3065e79fdd01a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir @@ -51,7 +51,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readlane), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -71,7 +71,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT 
intrinsic(@llvm.amdgcn.readlane), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 @@ -92,7 +92,7 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY3]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readlane), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[INT]](s32) %0:_(s32) = COPY $agpr0 @@ -134,7 +134,7 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY3]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readlane), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $agpr0 @@ -154,7 +154,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readlane), [[COPY]](s32), 
[[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $agpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll index fe6f77d1cd3e3..78c608e25d827 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -3095,10 +3095,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX7-NEXT: bb.2: ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), 
[[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -3141,10 +3141,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX12-NEXT: bb.2: ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -3285,10 +3285,10 @@ define amdgpu_ps float 
@s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX7-NEXT: bb.2: ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %16, %bb.3 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -3332,10 +3332,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX12-NEXT: bb.2: ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), 
[[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -3480,10 +3480,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX7-NEXT: bb.2: ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %17, %bb.3 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = 
V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -3527,10 +3527,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX12-NEXT: bb.2: ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit 
$exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -3669,10 +3669,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX7-NEXT: bb.2: ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -3715,10 +3715,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX12-NEXT: bb.2: ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX12-NEXT: 
[[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -3856,10 +3856,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX7-NEXT: bb.2: ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES 
[[BUILD_VECTOR]](<4 x s32>) ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -3902,10 +3902,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX12-NEXT: bb.2: ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -4067,10 +4067,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX7-NEXT: 
bb.2: ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -4124,10 +4124,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX12-NEXT: bb.2: ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; 
GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -4303,10 +4303,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX7-NEXT: bb.2: ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -4360,10 +4360,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX12-NEXT: bb.2: ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -4537,10 +4537,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX7-NEXT: bb.2: ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -4594,10 +4594,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX12-NEXT: bb.2: ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX12-NEXT: 
[[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -4768,10 +4768,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX7-NEXT: bb.2: ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES 
[[BUILD_VECTOR]](<4 x s32>) ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -4826,10 +4826,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX12-NEXT: bb.2: ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -5000,10 +5000,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX7-NEXT: 
bb.2: ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -5058,10 +5058,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX12-NEXT: bb.2: ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; 
GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -5232,10 +5232,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX7-NEXT: bb.2: ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -5290,10 +5290,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX12-NEXT: bb.2: ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -5461,10 +5461,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX7-NEXT: bb.2: ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX7-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -5517,10 +5517,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX12-NEXT: bb.2: ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; GFX12-NEXT: 
[[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsg.mir index d46a2c76c795f..3e9d3297091f6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsg.mir @@ -29,7 +29,7 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), 0, [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $vgpr0 G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), 0, %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsghalt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsghalt.mir index f32ad5c225a32..df490b26703fc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsghalt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsghalt.mir @@ -29,7 +29,7 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsghalt), 0, [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $vgpr0 G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsghalt), 0, %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll index e3b04661b15f8..0df8e68e7093c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll @@ -69,10 +69,10 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; 
CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -123,7 +123,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY6]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec @@ -170,17 +170,17 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = 
V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] ; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll index a4d8410249d90..4dc0778a08884 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll @@ -71,10 +71,10 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES 
[[BUILD_VECTOR1]](<4 x s32>) @@ -125,7 +125,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY7]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec @@ -172,17 +172,17 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; 
CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] ; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_4]](s32), [[COPY7]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll index 4c67fe88c3c32..9acc9d0be294e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll @@ -69,10 +69,10 @@ define amdgpu_ps float @struct_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: 
[[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -123,7 +123,7 @@ define amdgpu_ps float @struct_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec + ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY6]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec @@ -170,17 +170,17 @@ define amdgpu_ps float @struct_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), 
[[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] ; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll index ccbdc4eaebfcc..37880233b2733 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll @@ -71,10 +71,10 @@ define amdgpu_ps void @struct_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = 
V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) @@ -125,7 +125,7 @@ define amdgpu_ps void @struct_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY7]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed 
[[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec @@ -172,17 +172,17 @@ define amdgpu_ps void @struct_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %19, %bb.3 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]] ; CHECK-NEXT: 
[[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_4]](s32), [[COPY7]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:vcc(s1) = G_AND [[AND]], [[ICMP2]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND1]](s1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir index 9cd2219faf598..fe09e7555d41c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir @@ -56,7 +56,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.writelane), [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), [[COPY2]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $sgpr0 @@ -77,8 +77,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), 
implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.writelane), [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[COPY2]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -99,7 +99,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.writelane), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mad_64_32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mad_64_32.mir index a8dad974ca9c1..cce4beacafdb2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mad_64_32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mad_64_32.mir @@ -24,7 +24,7 @@ body: | ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; GFX8-NEXT: [[UMULH:%[0-9]+]]:vgpr_32(s32) = G_UMULH [[COPY4]], [[COPY5]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UMULH]](s32), implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UMULH]](s32), implicit $exec ; GFX8-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[MV]](s64) ; 
GFX8-NEXT: [[UADDO:%[0-9]+]]:sgpr(s32), [[UADDO1:%[0-9]+]]:sgpr(s32) = G_UADDO [[MUL]], [[UV]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:sgpr(s32), [[UADDE1:%[0-9]+]]:sgpr(s32) = G_UADDE [[V_READFIRSTLANE_B32_]], [[UV1]], [[UADDO1]] @@ -316,7 +316,7 @@ body: | ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; GFX8-NEXT: [[SMULH:%[0-9]+]]:vgpr_32(s32) = G_SMULH [[COPY4]], [[COPY5]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[SMULH]](s32), implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[SMULH]](s32), implicit $exec ; GFX8-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[V_READFIRSTLANE_B32_]](s32), [[C]] ; GFX8-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[MV]](s64) @@ -469,7 +469,7 @@ body: | ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; GFX8-NEXT: [[UMULH:%[0-9]+]]:vgpr_32(s32) = G_UMULH [[COPY2]], [[COPY3]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UMULH]](s32), implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UMULH]](s32), implicit $exec ; GFX8-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[MUL]](s32), [[V_READFIRSTLANE_B32_]](s32) ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C1]](s32) @@ -543,7 +543,7 @@ body: | ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; GFX8-NEXT: [[SMULH:%[0-9]+]]:vgpr_32(s32) = G_SMULH [[COPY2]], [[COPY3]] - ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[SMULH]](s32), implicit $exec + ; GFX8-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[SMULH]](s32), implicit $exec ; GFX8-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(slt), [[V_READFIRSTLANE_B32_]](s32), [[C1]] ; GFX8-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[MUL]](s32), [[V_READFIRSTLANE_B32_]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir index f70d650e9b319..b145a62318e30 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir @@ -31,7 +31,7 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.2 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[V_READFIRSTLANE_B32_]](s32), [[COPY1]] ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec @@ -83,14 +83,14 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %6, %bb.2 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32), [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32), [[UV6:%[0-9]+]]:vgpr_32(s32), [[UV7:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[COPY2]](<8 x s32>) - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = 
V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV6]](s32), implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV7]](s32), implicit $exec ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), 
[[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; CHECK-NEXT: [[UV8:%[0-9]+]]:vgpr(s64), [[UV9:%[0-9]+]]:vgpr(s64), [[UV10:%[0-9]+]]:vgpr(s64), [[UV11:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[COPY2]](<8 x s32>) ; CHECK-NEXT: [[UV12:%[0-9]+]]:sgpr(s64), [[UV13:%[0-9]+]]:sgpr(s64), [[UV14:%[0-9]+]]:sgpr(s64), [[UV15:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir index bb4a86779724d..1b12977ea6f15 100644 --- a/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir +++ b/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir @@ -310,6 +310,38 @@ body: | ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX908-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr72 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX908-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec @@ -445,6 +477,38 @@ body: | ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64 ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec @@ -1428,6 +1492,38 @@ body: | ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; 
GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; 
GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX908-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX908-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec @@ -1564,6 +1660,38 @@ body: | ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr72 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64 ; GFX90A-FLATSCR-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec @@ -2576,6 +2704,38 @@ body: | ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr46 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX908-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec @@ -2713,6 +2873,38 @@ body: | ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr28 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64 ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec @@ -3697,6 +3889,38 @@ body: | ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr75 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX908-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX908-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec @@ -3835,6 +4059,38 @@ body: | ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-FLATSCR-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64 ; GFX90A-FLATSCR-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec @@ -4847,6 +5103,38 @@ body: | ; 
GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr92 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX908-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec @@ -4986,6 +5274,38 @@ body: | ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr76 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64 ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec @@ -5971,6 +6291,38 @@ body: | ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; 
GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX908-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX908-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec @@ -6111,6 +6463,38 @@ body: | ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr29 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr91 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64 ; GFX90A-FLATSCR-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec @@ -7123,6 +7507,38 @@ body: | ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX908-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX908-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec @@ -7258,6 +7674,38 @@ body: | ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64 ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec @@ -8241,6 +8689,38 @@ body: | ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr42 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr94 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX908-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX908-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec @@ -8377,6 +8857,38 @@ body: | ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-FLATSCR-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64 ; GFX90A-FLATSCR-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec @@ -9388,12 +9900,44 @@ body: | ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr57 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX908-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1 ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 
implicit $exec, implicit $agpr0_agpr1 :: (store (s32) into %stack.1, addrspace 5) - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store (s32) into %stack.1 + 4, addrspace 5) ; GFX908-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; GFX908-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc @@ -9525,6 +10069,38 @@ body: | ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; 
GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64 ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec @@ -10509,13 +11085,45 @@ body: | ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr56 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX908-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX908-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec ; GFX908-FLATSCR-NEXT: $vgpr1 = V_ADD_U32_e32 8200, $vgpr1, 
implicit $exec - ; GFX908-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1 + ; GFX908-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1 ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD $vgpr0, $vgpr1, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1 :: (store (s32) into %stack.1, addrspace 5) - ; GFX908-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD $vgpr0, $vgpr1, 4, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1 :: (store (s32) into %stack.1 + 4, addrspace 5) ; GFX908-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) ; GFX908-FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc @@ -10647,6 +11255,38 @@ body: | ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr58 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64 ; GFX90A-FLATSCR-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec @@ -11657,14 +12297,46 @@ body: | ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; 
GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX908-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr95 ; GFX908-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.1, addrspace 5) ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1 + 4, addrspace 5) - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2 ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.1 + 8, addrspace 5) ; GFX908-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; GFX908-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc @@ -11796,6 +12468,38 @@ body: | ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64 ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit 
$exec @@ -12781,15 +13485,47 @@ body: | ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX908-FLATSCR-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr88 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX908-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX908-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX908-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec ; GFX908-FLATSCR-NEXT: $vgpr1 = V_ADD_U32_e32 8200, $vgpr1, implicit $exec - ; GFX908-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX908-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD $vgpr0, $vgpr1, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.1, addrspace 5) ; GFX908-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD $vgpr0, $vgpr1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1 + 4, addrspace 5) - ; GFX908-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2 ; GFX908-FLATSCR-NEXT: SCRATCH_STORE_DWORD $vgpr0, $vgpr1, 8, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.1 + 8, addrspace 5) ; GFX908-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit 
$exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) ; GFX908-FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc @@ -12921,6 +13657,38 @@ body: | ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr78 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec ; GFX90A-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_register_mask $agpr32, $vgpr0, 32, $exec, 64 ; GFX90A-FLATSCR-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll index d316e10037757..328aae51e56fa 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -1,20 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s +; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefix=HSA %s declare void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) nocapture, ptr addrspace(4) nocapture, i32, i1) #0 -@lds.i32 = unnamed_addr addrspace(3) global i32 undef, align 4 -@lds.arr = unnamed_addr addrspace(3) global [256 x i32] undef, align 4 +@lds.i32 = unnamed_addr addrspace(3) 
global i32 poison, align 4 +@lds.arr = unnamed_addr addrspace(3) global [256 x i32] poison, align 4 -@global.i32 = unnamed_addr addrspace(1) global i32 undef, align 4 -@global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4 +@global.i32 = unnamed_addr addrspace(1) global i32 poison, align 4 +@global.arr = unnamed_addr addrspace(1) global [256 x i32] poison, align 4 ;. -; HSA: @lds.i32 = unnamed_addr addrspace(3) global i32 undef, align 4 -; HSA: @lds.arr = unnamed_addr addrspace(3) global [256 x i32] undef, align 4 -; HSA: @global.i32 = unnamed_addr addrspace(1) global i32 undef, align 4 -; HSA: @global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4 +; HSA: @lds.i32 = unnamed_addr addrspace(3) global i32 poison, align 4 +; HSA: @lds.arr = unnamed_addr addrspace(3) global [256 x i32] poison, align 4 +; HSA: @global.i32 = unnamed_addr addrspace(1) global i32 poison, align 4 +; HSA: @global.arr = unnamed_addr addrspace(1) global [256 x i32] poison, align 4 ;. 
define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 { ; HSA-LABEL: define {{[^@]+}}@store_cast_0_flat_to_group_addrspacecast @@ -27,45 +26,30 @@ define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 { } define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast -; ATTRIBUTOR_HSA-SAME: () #[[ATTR2:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast +; HSA-SAME: () #[[ATTR2:[0-9]+]] { +; HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4 +; HSA-NEXT: ret void ; store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)) ret void } define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.i32 to ptr addrspace(4)), align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat -; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.i32 to ptr addrspace(4)), align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat +; HSA-SAME: () #[[ATTR2]] { +; HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.i32 to ptr addrspace(4)), 
align 4 +; HSA-NEXT: ret void ; store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.i32 to ptr addrspace(4)) ret void } define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat -; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat +; HSA-SAME: () #[[ATTR2]] { +; HSA-NEXT: store i32 7, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 +; HSA-NEXT: ret void ; store i32 7, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) ret void @@ -92,17 +76,11 @@ define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 { } define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define 
{{[^@]+}}@load_constant_cast_group_gv_gep_to_flat -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 -; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat +; HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { +; HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 +; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4 +; HSA-NEXT: ret void ; %val = load i32, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) store i32 %val, ptr addrspace(1) %out @@ -110,17 +88,11 @@ define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(ptr addrspace } define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 1 seq_cst, align 4 -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr 
addrspace(4)), i64 0, i64 8), i32 1 seq_cst, align 4 -; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat +; HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { +; HSA-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 1 seq_cst, align 4 +; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4 +; HSA-NEXT: ret void ; %val = atomicrmw add ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 1 seq_cst store i32 %val, ptr addrspace(1) %out @@ -128,19 +100,12 @@ define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(ptr addr } define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = cmpxchg ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4 -; AKF_HSA-NEXT: [[VAL0:%.*]] = extractvalue { i32, i1 } [[VAL]], 0 -; AKF_HSA-NEXT: store i32 [[VAL0]], ptr addrspace(1) [[OUT]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = cmpxchg ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4 -; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = extractvalue { i32, i1 } [[VAL]], 0 -; 
ATTRIBUTOR_HSA-NEXT: store i32 [[VAL0]], ptr addrspace(1) [[OUT]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat +; HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { +; HSA-NEXT: [[VAL:%.*]] = cmpxchg ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4 +; HSA-NEXT: [[VAL0:%.*]] = extractvalue { i32, i1 } [[VAL]], 0 +; HSA-NEXT: store i32 [[VAL0]], ptr addrspace(1) [[OUT]], align 4 +; HSA-NEXT: ret void ; %val = cmpxchg ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 @@ -149,15 +114,10 @@ define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(ptr addrsp } define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: call void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) align 4 [[OUT]], ptr addrspace(4) align 4 getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 32, i1 false) -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: call void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) align 4 [[OUT]], ptr addrspace(4) align 4 getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 32, i1 false) -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat +; HSA-SAME: (ptr addrspace(1) 
[[OUT:%.*]]) #[[ATTR2]] { +; HSA-NEXT: call void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) align 4 [[OUT]], ptr addrspace(4) align 4 getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 32, i1 false) +; HSA-NEXT: ret void ; call void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) align 4 %out, ptr addrspace(4) align 4 getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 32, i1 false) ret void @@ -165,15 +125,10 @@ define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(ptr addrspa ; Can't just search the pointer value define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: store ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), ptr addrspace(1) [[OUT]], align 8 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: store ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), ptr addrspace(1) [[OUT]], align 8 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat +; HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { +; HSA-NEXT: store ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), ptr addrspace(1) [[OUT]], align 8 +; HSA-NEXT: ret void ; store ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to 
ptr addrspace(4)), i64 0, i64 8), ptr addrspace(1) %out ret void @@ -181,15 +136,10 @@ define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(ptr addr ; Can't just search pointer types define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat +; HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { +; HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8 +; HSA-NEXT: ret void ; store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) %out ret void @@ -197,28 +147,19 @@ define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat ; Cast group to flat, do GEP, cast back to group define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 { -; AKF_HSA-LABEL: define 
{{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: store i32 7, ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)), align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group -; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)), align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group +; HSA-SAME: () #[[ATTR2]] { +; HSA-NEXT: store i32 7, ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)), align 4 +; HSA-NEXT: ret void ; store i32 7, ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)) ret void } define ptr addrspace(3) @ret_constant_cast_group_gv_gep_to_flat_to_group() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)) -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group -; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) 
addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)) +; HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group +; HSA-SAME: () #[[ATTR2]] { +; HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)) ; ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)) } @@ -226,17 +167,8 @@ define ptr addrspace(3) @ret_constant_cast_group_gv_gep_to_flat_to_group() #1 { attributes #0 = { argmemonly nounwind } attributes #1 = { nounwind } -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} -;. -; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -; AKF_HSA: attributes #[[ATTR1]] = { nounwind } -;. 
-; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -;. -; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. 
-; ATTRIBUTOR_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} +; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 6c83361b8cdc8..22a676664cffe 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -647,13 +647,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_add_f32_e32 v9, v9, v15 ; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 ; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 -; GFX908-NEXT: s_mov_b64 s[20:21], -1 ; GFX908-NEXT: s_branch .LBB3_4 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: s_mov_b64 s[20:21], s[16:17] ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 +; GFX908-NEXT: s_mov_b64 s[20:21], -1 ; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard @@ -799,13 +799,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27] ; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] -; GFX90A-NEXT: s_mov_b64 s[20:21], -1 ; GFX90A-NEXT: s_branch .LBB3_4 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: s_mov_b64 s[20:21], s[16:17] ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 +; GFX90A-NEXT: s_mov_b64 s[20:21], -1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll index cab8e0b8baaa5..c88bc3b22b020 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll +++ 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll @@ -1,12 +1,49 @@ -; RUN: llc -mtriple=amdgcn -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=GFX11 %s ; This test just checks that the compiler doesn't crash. 
-; CHECK-LABEL: {{^}}v32i8_to_v8i32: + define amdgpu_ps float @v32i8_to_v8i32(ptr addrspace(4) inreg) #0 { +; GCN-LABEL: v32i8_to_v8i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[0:1], 0x1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1] +; GCN-NEXT: ; return to shader part epilog +; +; VI-LABEL: v32i8_to_v8i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[0:1], 0x4 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1] +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v32i8_to_v8i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v32i8_to_v8i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s0 +; GFX11-NEXT: ; return to shader part epilog entry: %1 = load <32 x i8>, ptr addrspace(4) %0 %2 = bitcast <32 x i8> %1 to <8 x i32> @@ -16,9 +53,62 @@ entry: ret float %5 } -; CHECK-LABEL: {{^}}i8ptr_v16i8ptr: -; CHECK: s_endpgm define amdgpu_kernel void @i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN-LABEL: i8ptr_v16i8ptr: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; 
GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: i8ptr_v16i8ptr: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: i8ptr_v16i8ptr: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: i8ptr_v16i8ptr: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm entry: %0 = load <16 x i8>, ptr addrspace(1) %in store <16 x i8> %0, ptr addrspace(1) %out @@ -26,6 +116,63 @@ entry: } define amdgpu_kernel void @f32_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: f32_to_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; 
GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: f32_to_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f32_e64 v2, s2, 1.0 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x20000, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: f32_to_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e64 v1, s2, 1.0 +; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: f32_to_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e64 v0, s2, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm %load = load float, ptr addrspace(1) %in, align 4 %fadd32 = fadd float %load, 1.0 %bc = bitcast float %fadd32 to <2 x i16> @@ -35,6 
+182,63 @@ define amdgpu_kernel void @f32_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1) } define amdgpu_kernel void @v2i16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v2i16_to_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s5, s4, 0xffff0000 +; GCN-NEXT: s_add_i32 s4, s4, 2 +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: s_add_i32 s4, s4, 0x20000 +; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v2i16_to_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; VI-NEXT: s_add_i32 s2, s2, 2 +; VI-NEXT: s_and_b32 s1, s2, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_add_i32 s0, s0, 0x20000 +; VI-NEXT: v_add_f32_e64 v2, s0, 1.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v2i16_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v1, s2, 2 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v2i16_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, s2, 2 op_sel_hi:[1,0] +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(1) %in, align 4 %add.v2i16 = add <2 x i16> %load, %bc = bitcast <2 x i16> %add.v2i16 to float @@ -44,6 +248,67 @@ define amdgpu_kernel void @v2i16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) } define amdgpu_kernel void @f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: f32_to_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: f32_to_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x4000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f32_e64 v3, s2, 1.0 +; VI-NEXT: v_add_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 2.0, v3 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: f32_to_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e64 v1, s2, 1.0 +; GFX9-NEXT: v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: f32_to_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e64 v0, s2, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm %load = load float, ptr addrspace(1) %in, align 4 %fadd32 = fadd float %load, 1.0 %bc = bitcast float %fadd32 to <2 x half> @@ -53,6 +318,68 @@ define amdgpu_kernel void @f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) } define amdgpu_kernel void @v2f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v2f16_to_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GCN-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v2f16_to_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x4000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: 
v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_add_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e64 v3, s2, 2.0 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v2f16_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v2f16_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, s2, 2.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm %load = load <2 x half>, ptr addrspace(1) %in, align 4 %add.v2f16 = fadd <2 x half> %load, %bc = bitcast <2 x half> %add.v2f16 to float @@ -62,6 +389,50 @@ define amdgpu_kernel void @v2f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) } define amdgpu_kernel void @v4i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v4i8_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: 
v4i8_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v4i8_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v4i8_to_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %load = load <4 x i8>, ptr addrspace(1) %in, align 4 %bc = bitcast <4 x i8> %load to i32 store i32 %bc, ptr addrspace(1) %out, align 4 @@ -69,15 +440,112 @@ define amdgpu_kernel void @v4i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) % } define amdgpu_kernel void @i32_to_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: i32_to_v4i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: i32_to_v4i8: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: 
s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: i32_to_v4i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: i32_to_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %in, align 4 %bc = bitcast i32 %load to <4 x i8> store <4 x i8> %bc, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}bitcast_v2i32_to_f64: -; CHECK: s_endpgm + define amdgpu_kernel void @bitcast_v2i32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN-LABEL: bitcast_v2i32_to_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_add_i32 s5, s5, 9 +; GCN-NEXT: s_add_i32 s4, s4, 4 +; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v2i32_to_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s3, s3, 9 +; VI-NEXT: s_add_i32 s2, s2, 4 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 +; VI-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v2i32_to_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s3, s3, 9 +; GFX9-NEXT: s_add_i32 s2, s2, 4 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v2i32_to_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s3, s3, 9 +; GFX11-NEXT: s_add_i32 s2, s2, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %val = load <2 x i32>, ptr addrspace(1) %in, align 8 %add = add <2 x i32> %val, %bc = bitcast <2 x i32> %add to double @@ -86,9 +554,53 @@ define amdgpu_kernel void @bitcast_v2i32_to_f64(ptr addrspace(1) %out, ptr addrs ret void } -; CHECK-LABEL: {{^}}bitcast_f64_to_v2i32: -; CHECK: s_endpgm + define amdgpu_kernel void @bitcast_f64_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN-LABEL: bitcast_f64_to_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 4.0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_f64_to_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: 
v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_f64_to_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_f64_to_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %val = load double, ptr addrspace(1) %in, align 8 %add = fadd double %val, 4.0 %bc = bitcast double %add to <2 x i32> @@ -96,8 +608,114 @@ define amdgpu_kernel void @bitcast_f64_to_v2i32(ptr addrspace(1) %out, ptr addrs ret void } -; CHECK-LABEL: {{^}}bitcast_v2i64_to_v2f64: + define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, ptr addrspace(1) %out, <2 x i64> %value) { +; GCN-LABEL: bitcast_v2i64_to_v2f64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s9, s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN-NEXT: s_mov_b32 s8, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s9, 0 +; GCN-NEXT: s_mov_b32 s9, s8 +; GCN-NEXT: s_mov_b32 s10, s8 +; GCN-NEXT: s_mov_b32 s11, s8 +; GCN-NEXT: s_cbranch_scc1 .LBB10_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] +; GCN-NEXT: .LBB10_2: ; %end +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; 
GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v2i64_to_v2f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s11, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_mov_b32 s8, 0 +; VI-NEXT: s_mov_b32 s9, s8 +; VI-NEXT: s_mov_b32 s10, s8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s11, 0 +; VI-NEXT: s_mov_b32 s11, s8 +; VI-NEXT: s_cbranch_scc1 .LBB10_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b64 s[10:11], s[6:7] +; VI-NEXT: s_mov_b64 s[8:9], s[4:5] +; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v2i64_to_v2f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s11, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: s_cbranch_scc1 .LBB10_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], 
s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v2i64_to_v2f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s11, s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s9, s8 +; GFX11-NEXT: s_mov_b32 s10, s8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_mov_b32 s11, s8 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_mov_b32 s4, s2 +; GFX11-NEXT: s_mov_b32 s5, s3 +; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX11-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_mov_b32_e32 v2, s10 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -112,8 +730,114 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v2f64_to_v2i64: + define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, ptr addrspace(1) %out, <2 x double> %value) { +; GCN-LABEL: bitcast_v2f64_to_v2i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s9, s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN-NEXT: s_mov_b32 s8, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s9, 0 +; GCN-NEXT: s_mov_b32 s9, s8 +; GCN-NEXT: s_mov_b32 s10, s8 +; GCN-NEXT: s_mov_b32 s11, s8 +; GCN-NEXT: s_cbranch_scc1 .LBB11_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] +; GCN-NEXT: .LBB11_2: ; %end +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 
+; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v2f64_to_v2i64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s11, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_mov_b32 s8, 0 +; VI-NEXT: s_mov_b32 s9, s8 +; VI-NEXT: s_mov_b32 s10, s8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s11, 0 +; VI-NEXT: s_mov_b32 s11, s8 +; VI-NEXT: s_cbranch_scc1 .LBB11_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b64 s[10:11], s[6:7] +; VI-NEXT: s_mov_b64 s[8:9], s[4:5] +; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v2f64_to_v2i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s11, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: s_cbranch_scc1 .LBB11_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; 
+; GFX11-LABEL: bitcast_v2f64_to_v2i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s11, s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s9, s8 +; GFX11-NEXT: s_mov_b32 s10, s8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_mov_b32 s11, s8 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_mov_b32 s4, s2 +; GFX11-NEXT: s_mov_b32 s5, s3 +; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX11-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_mov_b32_e32 v2, s10 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -128,8 +852,78 @@ end: ret void } -; CHECK-LABEL: {{^}}v4i16_to_f64: + define amdgpu_kernel void @v4i16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v4i16_to_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s6, s5, 0xffff0000 +; GCN-NEXT: s_add_i32 s5, s5, 4 +; GCN-NEXT: s_and_b32 s7, s4, 0xffff0000 +; GCN-NEXT: s_add_i32 s4, s4, 4 +; GCN-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_or_b32 s5, s6, s5 +; GCN-NEXT: s_or_b32 s4, s7, s4 +; GCN-NEXT: s_add_i32 s5, s5, 0x40000 +; GCN-NEXT: s_add_i32 s4, s4, 0x40000 +; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 
0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v4i16_to_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s4, s2, 0xffff0000 +; VI-NEXT: s_add_i32 s2, s2, 4 +; VI-NEXT: s_and_b32 s5, s3, 0xffff0000 +; VI-NEXT: s_add_i32 s3, s3, 4 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_or_b32 s3, s5, s3 +; VI-NEXT: s_or_b32 s2, s4, s2 +; VI-NEXT: s_add_i32 s3, s3, 0x40000 +; VI-NEXT: s_add_i32 s2, s2, 0x40000 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v4i16_to_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v4i16_to_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, s3, 4 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s2, 4 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(1) %in, align 4 %add.v4i16 = add <4 x i16> %load, %bc = bitcast <4 x i16> %add.v4i16 to double @@ -138,8 +932,87 @@ define amdgpu_kernel void @v4i16_to_f64(ptr addrspace(1) %out, ptr 
addrspace(1) ret void } -; CHECK-LABEL: {{^}}v4f16_to_f64: + define amdgpu_kernel void @v4f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v4f16_to_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GCN-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, s5 +; GCN-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, s4 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, s5 +; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v4f16_to_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x4400 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s4, s3, 16 +; VI-NEXT: v_add_f16_e64 v1, s3, 4.0 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_add_f16_e64 v2, s2, 4.0 +; VI-NEXT: v_add_f16_sdwa v3, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v3 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_add_f64 
v[0:1], v[0:1], 1.0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v4f16_to_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v4f16_to_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v1, s3, 4.0 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v0, s2, 4.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load <4 x half>, ptr addrspace(1) %in, align 4 %add.v4half = fadd <4 x half> %load, %bc = bitcast <4 x half> %add.v4half to double @@ -148,8 +1021,83 @@ define amdgpu_kernel void @v4f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) ret void } -; CHECK-LABEL: {{^}}f64_to_v4f16: + define amdgpu_kernel void @f64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: f64_to_v4f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; 
GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0 +; GCN-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v3, 2.0, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v3 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: f64_to_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v4, 0x4000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 +; VI-NEXT: v_add_f16_sdwa v5, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 2.0, v1 +; VI-NEXT: v_add_f16_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 2.0, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: f64_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f64 v[0:1], s[4:5], 1.0 +; GFX9-NEXT: v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: f64_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 
s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load double, ptr addrspace(1) %in, align 4 %fadd32 = fadd double %load, 1.0 %bc = bitcast double %fadd32 to <4 x half> @@ -158,8 +1106,86 @@ define amdgpu_kernel void @f64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) ret void } -; CHECK-LABEL: {{^}}f64_to_v4i16: + define amdgpu_kernel void @f64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: f64_to_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: s_and_b32 s5, s4, 0xffff0000 +; GCN-NEXT: s_add_i32 s4, s4, 2 +; GCN-NEXT: s_and_b32 s6, s2, 0xffff0000 +; GCN-NEXT: s_add_i32 s2, s2, 2 +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: s_or_b32 s2, s6, s2 +; GCN-NEXT: s_add_i32 s4, s4, 0x20000 +; GCN-NEXT: s_add_i32 s5, s2, 0x20000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: f64_to_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: 
v_mov_b32_e32 v3, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 +; VI-NEXT: v_readfirstlane_b32 s0, v1 +; VI-NEXT: v_readfirstlane_b32 s1, v0 +; VI-NEXT: s_and_b32 s2, s1, 0xffff0000 +; VI-NEXT: s_add_i32 s1, s1, 2 +; VI-NEXT: s_and_b32 s3, s0, 0xffff0000 +; VI-NEXT: s_add_i32 s0, s0, 2 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s0, s3, s0 +; VI-NEXT: s_or_b32 s1, s2, s1 +; VI-NEXT: s_add_i32 s0, s0, 0x20000 +; VI-NEXT: s_add_i32 s1, s1, 0x20000 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: f64_to_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f64 v[0:1], s[4:5], 1.0 +; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: f64_to_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load double, ptr addrspace(1) %in, align 4 %fadd32 = fadd double %load, 1.0 %bc = bitcast double %fadd32 to <4 x i16> @@ -168,8 +1194,86 @@ define amdgpu_kernel void @f64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ret void } -; CHECK-LABEL: {{^}}v4i16_to_i64: + define 
amdgpu_kernel void @v4i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v4i16_to_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s2, s5, 0xffff0000 +; GCN-NEXT: s_add_i32 s5, s5, 4 +; GCN-NEXT: s_and_b32 s6, s4, 0xffff0000 +; GCN-NEXT: s_add_i32 s4, s4, 4 +; GCN-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_or_b32 s2, s2, s5 +; GCN-NEXT: s_or_b32 s4, s6, s4 +; GCN-NEXT: s_add_i32 s2, s2, 0x40000 +; GCN-NEXT: s_add_i32 s4, s4, 0x40000 +; GCN-NEXT: s_add_u32 s4, s4, 1 +; GCN-NEXT: s_addc_u32 s5, s2, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v4i16_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; VI-NEXT: s_add_i32 s1, s2, 4 +; VI-NEXT: s_and_b32 s2, s3, 0xffff0000 +; VI-NEXT: s_add_i32 s3, s3, 4 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_add_i32 s2, s2, 0x40000 +; VI-NEXT: s_add_i32 s0, s0, 0x40000 +; VI-NEXT: s_add_u32 s0, s0, 1 +; VI-NEXT: s_addc_u32 s1, s2, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v4i16_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v4i16_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, s2, 4 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s3, 4 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(1) %in, align 4 %add.v4i16 = add <4 x i16> %load, %bc = bitcast <4 x i16> %add.v4i16 to i64 @@ -178,8 +1282,91 @@ define amdgpu_kernel void @v4i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) ret void } -; CHECK-LABEL: {{^}}v4f16_to_i64: + define amdgpu_kernel void @v4f16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v4f16_to_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GCN-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, s5 +; GCN-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, s4 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, s5 +; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v2, 
4.0, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v4f16_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s0, s3, 16 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: v_mov_b32_e32 v6, s1 +; VI-NEXT: v_add_f16_e64 v4, s2, 4.0 +; VI-NEXT: v_add_f16_sdwa v5, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e64 v3, s3, 4.0 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: v_or_b32_e32 v3, v3, v5 +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v4f16_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v4f16_to_i64: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, s2, 4.0 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v1, s3, 4.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load <4 x half>, ptr addrspace(1) %in, align 4 %add.v4half = fadd <4 x half> %load, %bc = bitcast <4 x half> %add.v4half to i64 @@ -188,8 +1375,86 @@ define amdgpu_kernel void @v4f16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) ret void } -; CHECK-LABEL: {{^}}bitcast_i64_to_v4i16: + define amdgpu_kernel void @bitcast_i64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN-LABEL: bitcast_i64_to_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_add_u32 s2, s4, 4 +; GCN-NEXT: s_addc_u32 s4, s5, 0 +; GCN-NEXT: s_and_b32 s5, s2, 0xffff0000 +; GCN-NEXT: s_add_i32 s2, s2, 1 +; GCN-NEXT: s_and_b32 s6, s4, 0xffff0000 +; GCN-NEXT: s_add_i32 s4, s4, 3 +; GCN-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_or_b32 s2, s5, s2 +; GCN-NEXT: s_or_b32 s4, s6, s4 +; GCN-NEXT: s_add_i32 s5, s2, 0x20000 +; GCN-NEXT: s_add_i32 s4, s4, 0x40000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_i64_to_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 
s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s2, 4 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_and_b32 s2, s0, 0xffff0000 +; VI-NEXT: s_add_i32 s0, s0, 1 +; VI-NEXT: s_and_b32 s3, s1, 0xffff0000 +; VI-NEXT: s_add_i32 s1, s1, 3 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s0, s2, s0 +; VI-NEXT: s_or_b32 s1, s3, s1 +; VI-NEXT: s_add_i32 s0, s0, 0x20000 +; VI-NEXT: s_add_i32 s1, s1, 0x40000 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_i64_to_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s2, s2, 4 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: v_pk_add_u16 v1, s3, v0 +; GFX9-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_i64_to_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s2, s2, 4 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, 0x40003, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %in, align 8 %add = add i64 %val, 4 %bc = bitcast i64 %add to <4 x i16> @@ -198,8 +1463,93 @@ define amdgpu_kernel void @bitcast_i64_to_v4i16(ptr addrspace(1) %out, ptr addrs ret void } -; CHECK-LABEL: 
{{^}}bitcast_i64_to_v4f16: + define amdgpu_kernel void @bitcast_i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN-LABEL: bitcast_i64_to_v4f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_add_u32 s4, s4, 4 +; GCN-NEXT: s_addc_u32 s5, s5, 0 +; GCN-NEXT: s_lshr_b32 s6, s4, 16 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GCN-NEXT: s_lshr_b32 s4, s5, 16 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, s6 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, s5 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, s4 +; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_add_f32_e32 v3, 0x41000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v4 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_i64_to_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x4800 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s2, 4 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_lshr_b32 s3, s1, 16 +; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: v_add_f16_e64 v4, s1, 4.0 +; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v5, v5, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v2 +; VI-NEXT: v_add_f16_e64 v2, s0, 1.0 +; VI-NEXT: v_or_b32_e32 v2, v2, v5 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_i64_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x48004400 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x40003c00 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s2, s2, 4 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: v_pk_add_f16 v1, s3, v0 +; GFX9-NEXT: v_pk_add_f16 v0, s2, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_i64_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s2, s2, 4 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2 +; GFX11-NEXT: v_pk_add_f16 v1, 0x48004400, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %in, align 8 %add = add i64 %val, 4 %bc = bitcast i64 %add to <4 x half> @@ -208,8 +1558,81 @@ define amdgpu_kernel void @bitcast_i64_to_v4f16(ptr addrspace(1) %out, ptr addrs ret void } -; CHECK-LABEL: {{^}}v4i16_to_v2f32: + define amdgpu_kernel void @v4i16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v4i16_to_v2f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s6, s4, 0xffff0000 +; GCN-NEXT: s_add_i32 
s4, s4, 4 +; GCN-NEXT: s_and_b32 s7, s5, 0xffff0000 +; GCN-NEXT: s_add_i32 s5, s5, 4 +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NEXT: s_or_b32 s4, s6, s4 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_add_i32 s4, s4, 0x40000 +; GCN-NEXT: s_add_i32 s5, s5, 0x40000 +; GCN-NEXT: v_add_f32_e64 v1, s5, 1.0 +; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v4i16_to_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s0, s3, 0xffff0000 +; VI-NEXT: s_add_i32 s1, s3, 4 +; VI-NEXT: s_and_b32 s3, s2, 0xffff0000 +; VI-NEXT: s_add_i32 s2, s2, 4 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_add_i32 s2, s2, 0x40000 +; VI-NEXT: s_add_i32 s0, s0, 0x40000 +; VI-NEXT: v_add_f32_e64 v3, s0, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s2, 1.0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v4i16_to_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v4i16_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: 
v_pk_add_u16 v0, s3, 4 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 4 op_sel_hi:[1,0] +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v0 :: v_dual_add_f32 v0, 1.0, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(1) %in, align 4 %add.v4i16 = add <4 x i16> %load, %bc = bitcast <4 x i16> %add.v4i16 to <2 x float> @@ -218,8 +1641,90 @@ define amdgpu_kernel void @v4i16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; CHECK-LABEL: {{^}}v4f16_to_v2f32: + define amdgpu_kernel void @v4f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v4f16_to_v2f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, s5 +; GCN-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GCN-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, s5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, s4 +; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v1, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v0 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v3 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v4f16_to_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; 
VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: v_mov_b32_e32 v6, s1 +; VI-NEXT: v_add_f16_e64 v3, s2, 4.0 +; VI-NEXT: v_add_f16_e64 v4, s3, 4.0 +; VI-NEXT: v_add_f16_sdwa v5, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v3, v5 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v5 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v4f16_to_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v4f16_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, s3, 4.0 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v2, s2, 4.0 op_sel_hi:[1,0] +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v0 :: v_dual_add_f32 v0, 1.0, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load <4 x half>, ptr addrspace(1) %in, align 4 %add.v4half = fadd <4 x half> %load, %bc = bitcast <4 x half> %add.v4half to <2 x 
float> @@ -228,8 +1733,81 @@ define amdgpu_kernel void @v4f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; CHECK-LABEL: {{^}}v2f32_to_v4i16: + define amdgpu_kernel void @v2f32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v2f32_to_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_f32_e64 v0, s4, 2.0 +; GCN-NEXT: v_add_f32_e64 v1, s5, 4.0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: v_or_b32_e32 v0, v3, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x40000, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v2f32_to_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f32_e64 v2, s3, 4.0 +; VI-NEXT: v_add_f32_e64 v3, s2, 2.0 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x40000, v2 +; VI-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x20000, v2 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; 
VI-NEXT: s_endpgm +; +; GFX9-LABEL: v2f32_to_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s2, 0x40003 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e64 v0, s4, 2.0 +; GFX9-NEXT: v_add_f32_e64 v1, s5, 4.0 +; GFX9-NEXT: v_pk_add_u16 v1, v1, s2 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, -2 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v2f32_to_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e64 v0, s3, 4.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 2.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_add_u16 v1, 0x40003, v0 +; GFX11-NEXT: v_pk_sub_u16 v0, v2, -2 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load <2 x float>, ptr addrspace(1) %in, align 4 %add.v2f32 = fadd <2 x float> %load, %bc = bitcast <2 x float> %add.v2f32 to <4 x i16> @@ -238,8 +1816,90 @@ define amdgpu_kernel void @v2f32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; CHECK-LABEL: {{^}}v2f32_to_v4f16: + define amdgpu_kernel void @v2f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v2f32_to_v4f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_f32_e64 v0, s5, 4.0 +; GCN-NEXT: v_add_f32_e64 v1, s4, 2.0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NEXT: 
v_lshrrev_b32_e32 v3, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x41000000, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_add_f32_e32 v3, 2.0, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v1, v0, v1 +; GCN-NEXT: v_or_b32_e32 v0, v4, v2 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v2f32_to_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x4800 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f32_e64 v4, s2, 2.0 +; VI-NEXT: v_add_f32_e64 v5, s3, 4.0 +; VI-NEXT: v_add_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 4.0, v5 +; VI-NEXT: v_add_f16_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 1.0, v4 +; VI-NEXT: v_or_b32_e32 v3, v5, v2 +; VI-NEXT: v_or_b32_e32 v2, v4, v6 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v2f32_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s2, 0x48004400 +; GFX9-NEXT: s_mov_b32 s3, 0x40003c00 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f32_e64 v0, s4, 2.0 +; GFX9-NEXT: v_add_f32_e64 v1, s5, 4.0 
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s2 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v2f32_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e64 v0, s3, 4.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 2.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_add_f16 v1, 0x48004400, v0 +; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load <2 x float>, ptr addrspace(1) %in, align 4 %add.v2f32 = fadd <2 x float> %load, %bc = bitcast <2 x float> %add.v2f32 to <4 x half> @@ -248,8 +1908,82 @@ define amdgpu_kernel void @v2f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; CHECK-LABEL: {{^}}v4i16_to_v2i32: + define amdgpu_kernel void @v4i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v4i16_to_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s2, s4, 0xffff0000 +; GCN-NEXT: s_add_i32 s4, s4, 4 +; GCN-NEXT: s_and_b32 s6, s5, 0xffff0000 +; GCN-NEXT: s_add_i32 s5, s5, 4 +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NEXT: s_or_b32 s2, s2, s4 +; GCN-NEXT: s_or_b32 s4, s6, s5 +; GCN-NEXT: s_add_i32 s4, s4, 0x40001 +; GCN-NEXT: s_add_i32 s5, s2, 0x40001 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v4i16_to_v2i32: +; VI: ; %bb.0: +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s0, s3, 0xffff0000 +; VI-NEXT: s_add_i32 s1, s3, 4 +; VI-NEXT: s_and_b32 s3, s2, 0xffff0000 +; VI-NEXT: s_add_i32 s2, s2, 4 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_add_i32 s0, s0, 0x40001 +; VI-NEXT: s_add_i32 s2, s2, 0x40001 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v4i16_to_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v4i16_to_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, s3, 4 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 4 op_sel_hi:[1,0] +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(1) %in, align 4 %add.v4i16 = add <4 x i16> %load, %bc = bitcast <4 x i16> %add.v4i16 to <2 x i32> @@ -258,8 +1992,91 @@ define 
amdgpu_kernel void @v4i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; CHECK-LABEL: {{^}}v4f16_to_v2i32: + define amdgpu_kernel void @v4f16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v4f16_to_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, s5 +; GCN-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GCN-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, s5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, s4 +; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v1, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v3 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v4f16_to_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: v_mov_b32_e32 v6, s1 +; VI-NEXT: v_add_f16_e64 v3, s2, 4.0 +; VI-NEXT: v_add_f16_e64 v4, s3, 4.0 +; VI-NEXT: v_add_f16_sdwa v5, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: 
v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v3, v5 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v5 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v4f16_to_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v4f16_to_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, s3, 4.0 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v2, s2, 4.0 op_sel_hi:[1,0] +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load <4 x half>, ptr addrspace(1) %in, align 4 %add.v4half = fadd <4 x half> %load, %bc = bitcast <4 x half> %add.v4half to <2 x i32> @@ -268,8 +2085,86 @@ define amdgpu_kernel void @v4f16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; CHECK-LABEL: {{^}}v2i32_to_v4i16: + define amdgpu_kernel void @v2i32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v2i32_to_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_add_i32 s2, s4, 2 +; GCN-NEXT: s_add_i32 s6, s5, 4 +; GCN-NEXT: s_add_i32 s5, s5, 7 +; GCN-NEXT: s_add_i32 s4, s4, 3 +; GCN-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NEXT: s_and_b32 s6, s6, 0xffff0000 +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 s2, s2, 0xffff0000 +; GCN-NEXT: s_or_b32 s5, s6, s5 +; GCN-NEXT: s_or_b32 s2, s2, s4 +; GCN-NEXT: s_add_i32 s5, s5, 0x40000 +; GCN-NEXT: s_add_i32 s4, s2, 0x20000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v2i32_to_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s0, s3, 4 +; VI-NEXT: s_add_i32 s1, s2, 2 +; VI-NEXT: s_add_i32 s2, s2, 3 +; VI-NEXT: s_add_i32 s3, s3, 7 +; VI-NEXT: s_and_b32 s1, s1, 0xffff0000 +; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s3 +; VI-NEXT: s_or_b32 s1, s1, s2 +; VI-NEXT: s_add_i32 s0, s0, 0x40000 +; VI-NEXT: s_add_i32 s1, s1, 0x20000 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v2i32_to_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s2, s4, 2 +; GFX9-NEXT: s_add_i32 s3, s5, 4 +; GFX9-NEXT: v_pk_add_u16 v1, s3, v0 +; GFX9-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] 
op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v2i32_to_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s3, s3, 4 +; GFX11-NEXT: s_add_i32 s2, s2, 2 +; GFX11-NEXT: v_pk_add_u16 v1, 0x40003, s3 +; GFX11-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load <2 x i32>, ptr addrspace(1) %in, align 4 %add.v2i32 = add <2 x i32> %load, %bc = bitcast <2 x i32> %add.v2i32 to <4 x i16> @@ -278,8 +2173,93 @@ define amdgpu_kernel void @v2i32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ret void } -; CHECK-LABEL: {{^}}v2i32_to_v4f16: + define amdgpu_kernel void @v2i32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; GCN-LABEL: v2i32_to_v4f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_add_i32 s5, s5, 4 +; GCN-NEXT: s_add_i32 s4, s4, 2 +; GCN-NEXT: s_lshr_b32 s6, s5, 16 +; GCN-NEXT: s_lshr_b32 s7, s4, 16 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, s5 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, s7 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, s6 +; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_add_f32_e32 v3, 0x41000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; 
GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: v2i32_to_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x4800 +; VI-NEXT: v_mov_b32_e32 v4, 0x4000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s1, s3, 4 +; VI-NEXT: s_add_i32 s0, s2, 2 +; VI-NEXT: s_lshr_b32 s2, s1, 16 +; VI-NEXT: v_add_f16_e64 v3, s1, 4.0 +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: v_mov_b32_e32 v6, s1 +; VI-NEXT: v_add_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e64 v4, s0, 1.0 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v2i32_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x48004400 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x40003c00 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s2, s4, 2 +; GFX9-NEXT: s_add_i32 s3, s5, 4 +; GFX9-NEXT: v_pk_add_f16 v1, s3, v0 +; GFX9-NEXT: v_pk_add_f16 v0, s2, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v2i32_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s3, s3, 4 +; GFX11-NEXT: s_add_i32 s2, s2, 2 +; GFX11-NEXT: v_pk_add_f16 
v1, 0x48004400, s3 +; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %load = load <2 x i32>, ptr addrspace(1) %in, align 4 %add.v2i32 = add <2 x i32> %load, %bc = bitcast <2 x i32> %add.v2i32 to <4 x half> @@ -290,10 +2270,1158 @@ define amdgpu_kernel void @v2i32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1 declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32 immarg) -; CHECK-LABEL: {{^}}bitcast_v4f32_to_v2i64: -; CHECK: s_buffer_load_{{dwordx4|b128}} + + define <2 x i64> @bitcast_v4f32_to_v2i64(<2 x i64> %arg) { - %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> undef, i32 0, i32 0) +; GCN-LABEL: bitcast_v4f32_to_v2i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v0 +; GCN-NEXT: s_buffer_load_dwordx4 s[8:11], s[4:7], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, s9, v5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_2 +; GCN-NEXT: ; %bb.1: +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v4 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v5 +; GCN-NEXT: s_mov_b32 s4, 0x4f800000 +; GCN-NEXT: s_mov_b32 s5, 0xcf800000 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 +; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v5, vcc +; GCN-NEXT: v_mov_b32_e32 v8, s9 +; GCN-NEXT: v_fma_f32 v0, v1, s4, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_fma_f32 v0, v1, s5, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_mul_lo_u32 v9, v6, v1 +; GCN-NEXT: v_mul_lo_u32 v10, v7, v0 +; GCN-NEXT: v_mul_hi_u32 v11, v6, v0 +; GCN-NEXT: 
v_mul_lo_u32 v12, v6, v0 +; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GCN-NEXT: v_mul_hi_u32 v11, v0, v12 +; GCN-NEXT: v_mul_hi_u32 v13, v1, v12 +; GCN-NEXT: v_mul_lo_u32 v12, v1, v12 +; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v9 +; GCN-NEXT: v_mul_lo_u32 v14, v0, v9 +; GCN-NEXT: v_mul_hi_u32 v15, v1, v9 +; GCN-NEXT: v_mul_lo_u32 v9, v1, v9 +; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, v10, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc +; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc +; GCN-NEXT: v_mul_hi_u32 v9, v6, v0 +; GCN-NEXT: v_mul_lo_u32 v7, v7, v0 +; GCN-NEXT: v_mul_lo_u32 v10, v6, v0 +; GCN-NEXT: v_mul_lo_u32 v6, v6, v1 +; GCN-NEXT: v_mul_hi_u32 v11, v1, v10 +; GCN-NEXT: v_mul_lo_u32 v12, v1, v10 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v6 +; GCN-NEXT: v_mul_hi_u32 v9, v0, v6 +; GCN-NEXT: v_mul_lo_u32 v13, v0, v6 +; GCN-NEXT: v_mul_lo_u32 v6, v1, v6 +; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; GCN-NEXT: v_mul_hi_u32 v6, s8, v0 +; GCN-NEXT: v_mul_hi_u32 v7, s9, v0 +; GCN-NEXT: v_mul_lo_u32 v0, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v9, s8, v1 +; GCN-NEXT: v_mul_lo_u32 v10, s8, v1 +; GCN-NEXT: v_mul_hi_u32 v11, s9, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s9, v1 +; 
GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_hi_u32 v6, v4, v0 +; GCN-NEXT: v_mul_lo_u32 v7, v5, v0 +; GCN-NEXT: v_mul_lo_u32 v9, v4, v0 +; GCN-NEXT: v_mul_lo_u32 v10, v4, v1 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 2, v0 +; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v0 +; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GCN-NEXT: v_sub_i32_e32 v7, vcc, s9, v6 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, s8, v9 +; GCN-NEXT: v_subb_u32_e64 v7, s[4:5], v7, v5, vcc +; GCN-NEXT: v_subb_u32_e32 v6, vcc, v8, v6, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4 +; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GCN-NEXT: v_sub_i32_e32 v9, vcc, v9, v4 +; GCN-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 +; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5 +; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GCN-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cndmask_b32_e32 v4, v14, v12, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v4, v13, v11, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN-NEXT: .LBB28_2: ; %Flow1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB28_4 
+; GCN-NEXT: ; %bb.3: +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v4 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_mul_lo_u32 v0, v0, v1 +; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: v_mul_lo_u32 v1, v0, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, v1, v4 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: .LBB28_4: +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_or_b32_e32 v5, s11, v3 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_6 +; GCN-NEXT: ; %bb.5: +; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 +; GCN-NEXT: s_mov_b32 s4, 0x4f800000 +; GCN-NEXT: s_mov_b32 s5, 0xcf800000 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 +; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v8, s11 +; GCN-NEXT: v_fma_f32 v4, v5, s4, v4 +; GCN-NEXT: v_rcp_f32_e32 v4, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GCN-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-NEXT: v_fma_f32 v4, v5, s5, v4 +; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GCN-NEXT: v_mul_lo_u32 v9, v6, v5 +; GCN-NEXT: v_mul_lo_u32 v10, v7, v4 +; GCN-NEXT: v_mul_hi_u32 v11, v6, v4 +; GCN-NEXT: v_mul_lo_u32 v12, v6, v4 +; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, 
v9 +; GCN-NEXT: v_mul_hi_u32 v11, v4, v12 +; GCN-NEXT: v_mul_hi_u32 v13, v5, v12 +; GCN-NEXT: v_mul_lo_u32 v12, v5, v12 +; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GCN-NEXT: v_mul_hi_u32 v10, v4, v9 +; GCN-NEXT: v_mul_lo_u32 v14, v4, v9 +; GCN-NEXT: v_mul_hi_u32 v15, v5, v9 +; GCN-NEXT: v_mul_lo_u32 v9, v5, v9 +; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, v10, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc +; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v10, vcc +; GCN-NEXT: v_mul_hi_u32 v9, v6, v4 +; GCN-NEXT: v_mul_lo_u32 v7, v7, v4 +; GCN-NEXT: v_mul_lo_u32 v10, v6, v4 +; GCN-NEXT: v_mul_lo_u32 v6, v6, v5 +; GCN-NEXT: v_mul_hi_u32 v11, v5, v10 +; GCN-NEXT: v_mul_lo_u32 v12, v5, v10 +; GCN-NEXT: v_mul_hi_u32 v10, v4, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GCN-NEXT: v_mul_hi_u32 v7, v5, v6 +; GCN-NEXT: v_mul_hi_u32 v9, v4, v6 +; GCN-NEXT: v_mul_lo_u32 v13, v4, v6 +; GCN-NEXT: v_mul_lo_u32 v6, v5, v6 +; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; GCN-NEXT: v_mul_hi_u32 v6, s10, v4 +; GCN-NEXT: v_mul_hi_u32 v7, s11, v4 +; GCN-NEXT: v_mul_lo_u32 v4, s11, v4 +; GCN-NEXT: v_mul_hi_u32 v9, s10, v5 +; GCN-NEXT: v_mul_lo_u32 v10, s10, v5 +; GCN-NEXT: v_mul_hi_u32 v11, s11, v5 +; GCN-NEXT: v_mul_lo_u32 v5, s11, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GCN-NEXT: 
v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 +; GCN-NEXT: v_mul_lo_u32 v7, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v9, v2, v4 +; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 2, v4 +; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v4 +; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GCN-NEXT: v_sub_i32_e32 v7, vcc, s11, v6 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, s10, v9 +; GCN-NEXT: v_subb_u32_e64 v7, s[4:5], v7, v3, vcc +; GCN-NEXT: v_subb_u32_e32 v6, vcc, v8, v6, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v2 +; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GCN-NEXT: v_sub_i32_e32 v9, vcc, v9, v2 +; GCN-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 +; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GCN-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v2, v14, v12, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 +; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v2, v13, v11, vcc +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[4:5] +; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN-NEXT: .LBB28_6: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB28_8 +; GCN-NEXT: ; %bb.7: +; GCN-NEXT: v_sub_i32_e32 v3, 
vcc, 0, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GCN-NEXT: v_mul_lo_u32 v3, v3, v4 +; GCN-NEXT: v_mul_hi_u32 v3, v4, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v3 +; GCN-NEXT: v_mul_lo_u32 v4, v3, v2 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, v4, v2 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; GCN-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: .LBB28_8: +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v4 +; GCN-NEXT: v_mov_b32_e32 v3, v5 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v2i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_buffer_load_dwordx4 s[8:11], s[4:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v5, v1 +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_or_b32_e32 v1, s9, v5 +; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: ; %bb.1: +; VI-NEXT: v_cvt_f32_u32_e32 v0, v4 +; VI-NEXT: v_cvt_f32_u32_e32 v1, v5 +; VI-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 +; VI-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc +; VI-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 +; VI-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; VI-NEXT: v_trunc_f32_e32 v1, v1 +; VI-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 +; VI-NEXT: v_cvt_u32_f32_e32 v8, v1 +; VI-NEXT: 
v_cvt_u32_f32_e32 v9, v0 +; VI-NEXT: v_mul_lo_u32 v6, v10, v8 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v9, 0 +; VI-NEXT: v_mul_lo_u32 v7, v11, v9 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v6 +; VI-NEXT: v_add_u32_e32 v13, vcc, v1, v7 +; VI-NEXT: v_mul_hi_u32 v12, v9, v0 +; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v13, 0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v6 +; VI-NEXT: v_addc_u32_e32 v14, vcc, 0, v7, vcc +; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v13, 0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v12, v0 +; VI-NEXT: v_addc_u32_e32 v0, vcc, v14, v1, vcc +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, v9, v0 +; VI-NEXT: v_addc_u32_e32 v13, vcc, v8, v1, vcc +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v12, 0 +; VI-NEXT: v_mul_lo_u32 v8, v10, v13 +; VI-NEXT: v_mul_lo_u32 v9, v11, v12 +; VI-NEXT: v_mul_hi_u32 v10, v12, v0 +; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v0, 0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v8 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v9 +; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v1, 0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v1, 0 +; VI-NEXT: v_add_u32_e32 v8, vcc, v10, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; VI-NEXT: v_addc_u32_e32 v6, vcc, v9, v7, vcc +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v12, v0 +; VI-NEXT: v_addc_u32_e32 v7, vcc, v13, v1, vcc +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v7, 0 +; VI-NEXT: v_mul_hi_u32 v8, s8, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v6, 0 +; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v7, 0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v8, v0 +; VI-NEXT: 
v_addc_u32_e32 v0, vcc, v9, v1, vcc +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v0, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; VI-NEXT: v_mul_lo_u32 v8, v4, v7 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0 +; VI-NEXT: v_mul_lo_u32 v9, v5, v6 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v8 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v9 +; VI-NEXT: v_sub_u32_e32 v8, vcc, s9, v1 +; VI-NEXT: v_sub_u32_e32 v0, vcc, s8, v0 +; VI-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v5, vcc +; VI-NEXT: v_sub_u32_e64 v9, s[4:5], v0, v4 +; VI-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5] +; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 +; VI-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5 +; VI-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5] +; VI-NEXT: v_add_u32_e64 v9, s[4:5], 2, v6 +; VI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, v7, s[4:5] +; VI-NEXT: v_add_u32_e64 v11, s[4:5], 1, v6 +; VI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, v7, s[4:5] +; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 +; VI-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5] +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v10, v1, vcc +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; VI-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v11, v9, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; VI-NEXT: ; implicit-def: $vgpr4_vgpr5 +; VI-NEXT: .LBB28_2: ; %Flow1 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; VI-NEXT: s_cbranch_execz .LBB28_4 +; VI-NEXT: ; %bb.3: +; VI-NEXT: v_cvt_f32_u32_e32 v0, v4 +; VI-NEXT: v_sub_u32_e32 v1, vcc, 0, v4 +; VI-NEXT: v_rcp_iflag_f32_e32 
v0, v0 +; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VI-NEXT: v_mul_lo_u32 v1, v1, v0 +; VI-NEXT: v_mul_hi_u32 v1, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_mul_hi_u32 v0, s8, v0 +; VI-NEXT: v_mul_lo_u32 v1, v0, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v0 +; VI-NEXT: v_sub_u32_e32 v1, vcc, s8, v1 +; VI-NEXT: v_sub_u32_e32 v6, vcc, v1, v4 +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v0 +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: .LBB28_4: +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_or_b32_e32 v5, s11, v3 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; VI-NEXT: ; implicit-def: $vgpr4_vgpr5 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_6 +; VI-NEXT: ; %bb.5: +; VI-NEXT: v_cvt_f32_u32_e32 v4, v2 +; VI-NEXT: v_cvt_f32_u32_e32 v5, v3 +; VI-NEXT: v_sub_u32_e32 v10, vcc, 0, v2 +; VI-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc +; VI-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4 +; VI-NEXT: v_rcp_f32_e32 v4, v4 +; VI-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; VI-NEXT: v_trunc_f32_e32 v5, v5 +; VI-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4 +; VI-NEXT: v_cvt_u32_f32_e32 v8, v5 +; VI-NEXT: v_cvt_u32_f32_e32 v9, v4 +; VI-NEXT: v_mul_lo_u32 v6, v10, v8 +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 +; VI-NEXT: v_mul_lo_u32 v7, v11, v9 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, v5, v7 +; VI-NEXT: v_mul_hi_u32 v12, v9, v4 +; VI-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v5 +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v6, vcc +; VI-NEXT: 
v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 +; VI-NEXT: v_add_u32_e32 v4, vcc, v12, v4 +; VI-NEXT: v_addc_u32_e32 v4, vcc, v13, v5, vcc +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, v9, v4 +; VI-NEXT: v_addc_u32_e32 v13, vcc, v8, v5, vcc +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 +; VI-NEXT: v_mul_lo_u32 v8, v10, v13 +; VI-NEXT: v_mul_lo_u32 v9, v11, v12 +; VI-NEXT: v_mul_hi_u32 v10, v12, v4 +; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v4, 0 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v8 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v9 +; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, 0 +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v5, 0 +; VI-NEXT: v_add_u32_e32 v8, vcc, v10, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; VI-NEXT: v_addc_u32_e32 v6, vcc, v9, v7, vcc +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v12, v4 +; VI-NEXT: v_addc_u32_e32 v7, vcc, v13, v5, vcc +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s10, v7, 0 +; VI-NEXT: v_mul_hi_u32 v8, s10, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v4 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s11, v6, 0 +; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s11, v7, 0 +; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v4 +; VI-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, v4, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; VI-NEXT: v_mul_lo_u32 v8, v2, v7 +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0 +; VI-NEXT: v_mul_lo_u32 v9, v3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v8 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v9 +; VI-NEXT: v_sub_u32_e32 v8, vcc, s11, v5 +; VI-NEXT: v_sub_u32_e32 v4, vcc, s10, v4 +; VI-NEXT: 
v_subb_u32_e64 v8, s[4:5], v8, v3, vcc +; VI-NEXT: v_sub_u32_e64 v9, s[4:5], v4, v2 +; VI-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5] +; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 +; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 +; VI-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 +; VI-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5] +; VI-NEXT: v_add_u32_e64 v9, s[4:5], 2, v6 +; VI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, v7, s[4:5] +; VI-NEXT: v_add_u32_e64 v11, s[4:5], 1, v6 +; VI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, v7, s[4:5] +; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 +; VI-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5] +; VI-NEXT: v_mov_b32_e32 v10, s11 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v10, v5, vcc +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 +; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e64 v2, v11, v9, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; VI-NEXT: ; implicit-def: $vgpr2_vgpr3 +; VI-NEXT: .LBB28_6: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; VI-NEXT: s_cbranch_execz .LBB28_8 +; VI-NEXT: ; %bb.7: +; VI-NEXT: v_cvt_f32_u32_e32 v3, v2 +; VI-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 +; VI-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; VI-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; VI-NEXT: v_cvt_u32_f32_e32 v3, v3 +; VI-NEXT: v_mul_lo_u32 v4, v4, v3 +; VI-NEXT: v_mul_hi_u32 v4, v3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; VI-NEXT: v_mul_hi_u32 v3, s10, v3 +; VI-NEXT: v_mul_lo_u32 v4, v3, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v3 +; VI-NEXT: v_sub_u32_e32 v4, vcc, s10, v4 +; VI-NEXT: v_sub_u32_e32 v6, vcc, v4, v2 +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 
v3, v3, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v3 +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc +; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: .LBB28_8: +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v2, v4 +; VI-NEXT: v_mov_b32_e32 v3, v5 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_buffer_load_dwordx4 s[8:11], s[4:7], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s9, v5 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v5 +; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v5, vcc +; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v0 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v9, 0 +; GFX9-NEXT: v_add3_u32 v12, v1, v6, v7 +; GFX9-NEXT: v_mul_hi_u32 v1, v9, v0 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v12, 0 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v1, v6 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v7, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v12, 0 +; GFX9-NEXT: v_add_co_u32_e32 
v0, vcc, v13, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v14, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v1, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v12, 0 +; GFX9-NEXT: v_add3_u32 v1, v1, v6, v7 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v1, 0 +; GFX9-NEXT: v_mul_hi_u32 v10, v12, v0 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v1, 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v0, 0 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v9, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v8, s8, v6 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v9, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v5, v6 +; GFX9-NEXT: v_mul_lo_u32 v9, v4, v7 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0 +; GFX9-NEXT: v_add3_u32 v1, v1, v9, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, s9, v1 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 +; GFX9-NEXT: v_subb_co_u32_e64 v8, 
s[4:5], v8, v5, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v0, v4 +; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v9, s[4:5], 2, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v11, s[4:5], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v10, s9 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v10, v1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v11, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX9-NEXT: .LBB28_2: ; %Flow1 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; GFX9-NEXT: s_cbranch_execz .LBB28_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v4 +; GFX9-NEXT: v_sub_u32_e32 v1, 0, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 +; GFX9-NEXT: v_sub_u32_e32 v1, s8, v1 +; GFX9-NEXT: v_sub_u32_e32 v6, v1, v4 +; 
GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: .LBB28_4: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v5, s11, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB28_6 +; GFX9-NEXT: ; %bb.5: +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 +; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc +; GFX9-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mul_hi_u32 v12, v9, v4 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 +; GFX9-NEXT: 
v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s10, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v8, s10, v6 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s11, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s11, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6 +; GFX9-NEXT: v_mul_lo_u32 v9, v2, v7 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0 +; GFX9-NEXT: v_add3_u32 v5, v5, v9, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, s11, v5 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s10, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v8, v3, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v4, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v9, s[4:5], 2, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v11, s[4:5], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v10, s11 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v10, v5, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v11, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: .LBB28_6: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; GFX9-NEXT: s_cbranch_execz .LBB28_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2 +; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 +; GFX9-NEXT: v_sub_u32_e32 v4, s10, v4 +; GFX9-NEXT: v_sub_u32_e32 v6, v4, v2 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc +; GFX9-NEXT: 
v_mov_b32_e32 v5, 0 +; GFX9-NEXT: .LBB28_8: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_buffer_load_b128 s[4:7], s[0:3], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v1, s5, v5 +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: ; %bb.1: +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v4 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v5 +; GFX11-NEXT: v_sub_co_u32 v10, vcc_lo, 0, v4 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, 0, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX11-NEXT: v_trunc_f32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 +; GFX11-NEXT: v_cvt_u32_f32_e32 v12, v1 +; GFX11-NEXT: v_cvt_u32_f32_e32 v13, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_lo_u32 v6, v10, v12 +; GFX11-NEXT: v_mul_lo_u32 v7, v11, v13 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v10, v13, 0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v14, v1, v6, v7 +; GFX11-NEXT: v_mul_hi_u32 v15, v13, v0 +; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v12, v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v13, v14, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v12, v14, 0 +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v15, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v9, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v13, vcc_lo, v13, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v12, v1, vcc_lo +; GFX11-NEXT: v_mul_lo_u32 v6, v11, v13 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v10, v13, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_lo_u32 v7, v10, v12 +; GFX11-NEXT: v_mul_hi_u32 v11, v13, v0 +; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v12, v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v10, v1, v7, v6 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v13, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v12, v10, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v11, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, 
vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v8 +; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v9, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v13, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v12, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mul_hi_u32 v11, s4, v8 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, s5, v8, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, s4, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, s5, v10, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v11, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v1, v7, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v0, v8 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mul_lo_u32 v8, v5, v6 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v6, 0 +; GFX11-NEXT: v_mul_lo_u32 v9, v4, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, s4, v0 +; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v8, s5, v1 +; GFX11-NEXT: v_sub_co_ci_u32_e64 v8, s0, v8, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v9, s0, v6, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v10, s0, 0, v7, s0 +; GFX11-NEXT: v_sub_co_u32 v11, s0, v0, v4 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo +; GFX11-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0 +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v11, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, v1, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v8, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, v6, 1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v12, v10 :: v_dual_cndmask_b32 v4, v11, v9 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v1 :: v_dual_cndmask_b32 v0, v6, v4 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB28_2: ; %Flow1 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s1 +; GFX11-NEXT: s_cbranch_execz .LBB28_4 +; GFX11-NEXT: ; %bb.3: +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v4 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v4 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v1, v1, v0 +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_lo_u32 v1, v0, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v6, v1, v4 +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v6 :: v_dual_cndmask_b32 v0, v0, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v4 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: .LBB28_4: +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_or_b32_e32 v5, s7, v3 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB28_6 +; GFX11-NEXT: ; %bb.5: +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v5, v3 +; GFX11-NEXT: v_sub_co_u32 v11, vcc_lo, 0, v2 +; GFX11-NEXT: 
v_sub_co_ci_u32_e32 v12, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmamk_f32 v4, v5, 0x4f800000, v4 +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GFX11-NEXT: v_trunc_f32_e32 v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fmamk_f32 v4, v5, 0xcf800000, v4 +; GFX11-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GFX11-NEXT: v_cvt_u32_f32_e32 v14, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_lo_u32 v6, v11, v13 +; GFX11-NEXT: v_mul_lo_u32 v7, v12, v14 +; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v11, v14, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v15, v5, v6, v7 +; GFX11-NEXT: v_mul_hi_u32 v16, v14, v4 +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v13, v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v14, v15, 0 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v13, v15, 0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v8, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v10, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo +; 
GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v13, v5, vcc_lo +; GFX11-NEXT: v_mul_lo_u32 v6, v12, v14 +; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v11, v14, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_lo_u32 v7, v11, v13 +; GFX11-NEXT: v_mul_hi_u32 v12, v14, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v11, v5, v7, v6 +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v13, v4, 0 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v14, v11, 0 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v13, v11, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v12, v5 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7 +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v8, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v10, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v9 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v14, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v13, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mul_hi_u32 v11, s6, v8 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, s7, v8, 0 +; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, s6, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, s7, v10, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: 
v_add_co_u32 v4, vcc_lo, v11, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v4, v8 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mul_lo_u32 v8, v3, v6 +; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v2, v6, 0 +; GFX11-NEXT: v_mul_lo_u32 v9, v2, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, s6, v4 +; GFX11-NEXT: v_add3_u32 v5, v5, v9, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v8, s7, v5 +; GFX11-NEXT: v_sub_co_ci_u32_e64 v8, s0, v8, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v9, s0, v6, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v10, s0, 0, v7, s0 +; GFX11-NEXT: v_sub_co_u32 v11, s0, v4, v2 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo +; GFX11-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0 +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v11, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, v5, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v8, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo +; 
GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v2, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, v6, 1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v12, v10 :: v_dual_cndmask_b32 v4, v11, v9 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v3 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: .LBB28_6: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s1 +; GFX11-NEXT: s_cbranch_execz .LBB28_8 +; GFX11-NEXT: ; %bb.7: +; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v4, v4, v3 +; GFX11-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GFX11-NEXT: v_mul_hi_u32 v3, s6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_lo_u32 v4, v3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v3 +; GFX11-NEXT: v_sub_nc_u32_e32 v4, s6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v6, v4, v2 +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, 
v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v3, v5 :: v_dual_mov_b32 v5, 0 +; GFX11-NEXT: .LBB28_8: +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> poison, i32 0, i32 0) %cast = bitcast <4 x float> %val to <2 x i64> %div = udiv <2 x i64> %cast, %arg ret <2 x i64> %div @@ -301,8 +3429,44 @@ define <2 x i64> @bitcast_v4f32_to_v2i64(<2 x i64> %arg) { declare half @llvm.canonicalize.f16(half) -; CHECK-LABEL: {{^}}bitcast_f32_to_v1i32: + define amdgpu_kernel void @bitcast_f32_to_v1i32(ptr addrspace(1) %out) { +; GCN-LABEL: bitcast_f32_to_v1i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x387c0000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_f32_to_v1i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x387c0000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_f32_to_v1i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x387c0000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_f32_to_v1i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: 
v_dual_mov_b32 v1, 0x387c0000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %f16 = call arcp afn half @llvm.canonicalize.f16(half 0xH03F0) %f32 = fpext half %f16 to float %v = bitcast float %f32 to <1 x i32> @@ -311,8 +3475,91 @@ define amdgpu_kernel void @bitcast_f32_to_v1i32(ptr addrspace(1) %out) { ret void } -; CHECK-LABEL: {{^}}bitcast_v4i64_to_v16i16: + define amdgpu_kernel void @bitcast_v4i64_to_v16i16(i32 %cond, ptr addrspace(1) %out, <4 x i64> %value) { +; GCN-LABEL: bitcast_v4i64_to_v16i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v4i64_to_v16i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s9, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s9, 0 +; VI-NEXT: s_add_u32 s6, s4, 16 +; VI-NEXT: s_addc_u32 s7, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; 
VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v4i64_to_v16i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v4i64_to_v16i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -329,8 +3576,130 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v4f64_to_v16f16: + define amdgpu_kernel void @bitcast_v4f64_to_v16f16(i32 %cond, ptr addrspace(1) %out, <4 x double> %value) { +; GCN-LABEL: bitcast_v4f64_to_v16f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: 
s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s11, 0 +; GCN-NEXT: s_mov_b32 s18, 0 +; GCN-NEXT: s_mov_b32 s15, 0 +; GCN-NEXT: s_mov_b32 s19, 0 +; GCN-NEXT: s_mov_b32 s16, 0 +; GCN-NEXT: s_mov_b32 s20, 0 +; GCN-NEXT: s_mov_b32 s17, 0 +; GCN-NEXT: s_mov_b32 s21, 0 +; GCN-NEXT: s_mov_b32 s8, 0 +; GCN-NEXT: s_mov_b32 s12, 0 +; GCN-NEXT: s_mov_b32 s9, 0 +; GCN-NEXT: s_mov_b32 s13, 0 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s14, 0 +; GCN-NEXT: s_mov_b32 s7, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, s18 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, s11 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, s19 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, s15 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, s20 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, s16 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, s21 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, s17 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, s12 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, s8 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, s13 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, s9 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, s14 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, s10 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, s6 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, s7 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NEXT: v_or_b32_e32 v7, v15, v14 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: 
buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v4f64_to_v16f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s9, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s9, 0 +; VI-NEXT: s_add_u32 s6, s4, 16 +; VI-NEXT: s_addc_u32 s7, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v4f64_to_v16f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v4f64_to_v16f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: 
v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -347,8 +3716,91 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v16i16_to_v4i64: + define amdgpu_kernel void @bitcast_v16i16_to_v4i64(i32 %cond, ptr addrspace(1) %out, <16 x i16> %value) { +; GCN-LABEL: bitcast_v16i16_to_v4i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v16i16_to_v4i64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s9, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s9, 0 +; VI-NEXT: s_add_u32 s6, s4, 16 +; VI-NEXT: s_addc_u32 s7, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: 
v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v16i16_to_v4i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v16i16_to_v4i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -365,8 +3817,91 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v16f16_to_v4f64: + define amdgpu_kernel void @bitcast_v16f16_to_v4f64(i32 %cond, ptr addrspace(1) %out, <16 x half> %value) { +; GCN-LABEL: bitcast_v16f16_to_v4f64: +; GCN: ; %bb.0: ; %entry +; 
GCN-NEXT: s_load_dword s1, s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v16f16_to_v4f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s9, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s9, 0 +; VI-NEXT: s_add_u32 s6, s4, 16 +; VI-NEXT: s_addc_u32 s7, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v16f16_to_v4f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 
+; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v16f16_to_v4f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -383,9 +3918,110 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v20f16_to_v5f64: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v20f16_to_v5f64(i32 %cond, ptr addrspace(1) %out, <20 x half> %value) { +; GCN-LABEL: bitcast_v20f16_to_v5f64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], 
off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v20f16_to_v5f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s8, s4, 16 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v20f16_to_v5f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v20f16_to_v5f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -402,9 +4038,110 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v10f32_to_v5f64: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v10f32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) { +; GCN-LABEL: bitcast_v10f32_to_v5f64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: s_waitcnt 
lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v10f32_to_v5f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s8, s4, 16 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v10f32_to_v5f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: s_nop 0 +; 
GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v10f32_to_v5f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -421,9 +4158,110 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v10i32_to_v5f64: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v10i32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) { +; GCN-LABEL: bitcast_v10i32_to_v5f64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: 
v_mov_b32_e32 v9, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v10i32_to_v5f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s8, s4, 16 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v10i32_to_v5f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, 
v[0:3], s[12:13] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v10i32_to_v5f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -440,9 +4278,110 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v10f32_to_v5i64: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v10f32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) { +; GCN-LABEL: bitcast_v10f32_to_v5i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; 
GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v10f32_to_v5i64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s8, s4, 16 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v10f32_to_v5i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, 
s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v10f32_to_v5i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -459,9 +4398,110 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v10i32_to_v5i64: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v10i32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) { +; GCN-LABEL: bitcast_v10i32_to_v5i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, 
s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v10i32_to_v5i64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s8, s4, 16 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v10i32_to_v5i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v10i32_to_v5i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -478,9 +4518,110 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v40i8_to_v5f64: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v40i8_to_v5f64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) { +; GCN-LABEL: bitcast_v40i8_to_v5f64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: 
v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v40i8_to_v5f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s8, s4, 16 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v40i8_to_v5f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 +; GFX9-NEXT: s_nop 0 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v40i8_to_v5f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -497,9 +4638,110 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v40i8_to_v5i64: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v40i8_to_v5i64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) { +; GCN-LABEL: bitcast_v40i8_to_v5i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; 
GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v40i8_to_v5i64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s8, s4, 16 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v40i8_to_v5i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] 
offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v40i8_to_v5i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -516,9 +4758,109 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v5f64_to_v10f32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v5f64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) { +; GCN-LABEL: bitcast_v5f64_to_v10f32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, 
s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v5f64_to_v10f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s7, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s7, 0 +; VI-NEXT: s_add_u32 s8, s4, 16 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v5f64_to_v10f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], 
s[12:13] offset:16 +; GFX9-NEXT: s_cmp_lg_u32 s7, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v5f64_to_v10f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 +; GFX11-NEXT: s_cmp_lg_u32 s7, 0 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -535,9 +4877,109 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v5f64_to_v10i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v5f64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) { +; GCN-LABEL: bitcast_v5f64_to_v10i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; 
GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v5f64_to_v10i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s7, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s7, 0 +; VI-NEXT: s_add_u32 s8, s4, 16 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v5f64_to_v10i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 +; GFX9-NEXT: s_cmp_lg_u32 s7, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v5f64_to_v10i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 +; GFX11-NEXT: s_cmp_lg_u32 s7, 0 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -554,9 +4996,109 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v5i64_to_v10f32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v5i64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) { +; GCN-LABEL: bitcast_v5i64_to_v10f32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: 
v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v5i64_to_v10f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s7, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s7, 0 +; VI-NEXT: s_add_u32 s8, s4, 16 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v5i64_to_v10f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 +; GFX9-NEXT: s_cmp_lg_u32 s7, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v5i64_to_v10f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 +; GFX11-NEXT: s_cmp_lg_u32 s7, 0 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -573,9 +5115,109 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v5i64_to_v10i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v5i64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) { +; GCN-LABEL: bitcast_v5i64_to_v10i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s15, 0xf000 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: 
v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 +; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v5i64_to_v10i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s7, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s7, 0 +; VI-NEXT: s_add_u32 s8, s4, 16 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v5i64_to_v10i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 
v3, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 +; GFX9-NEXT: s_cmp_lg_u32 s7, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v5i64_to_v10i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 +; GFX11-NEXT: s_cmp_lg_u32 s7, 0 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -592,9 +5234,115 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v6f64_to_v12i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v6f64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) { +; GCN-LABEL: bitcast_v6f64_to_v12i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NEXT: 
s_mov_b32 s18, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: v_mov_b32_e32 v10, s0 +; GCN-NEXT: v_mov_b32_e32 v11, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v6f64_to_v12i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s9, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s9, 0 +; VI-NEXT: s_add_u32 s10, s4, 16 +; VI-NEXT: s_addc_u32 s11, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v6f64_to_v12i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v6f64_to_v12i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32 +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -611,9 +5359,115 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v6f64_to_v12f32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v6f64_to_v12f32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) { +; GCN-LABEL: 
bitcast_v6f64_to_v12f32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NEXT: s_mov_b32 s18, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: v_mov_b32_e32 v10, s0 +; GCN-NEXT: v_mov_b32_e32 v11, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v6f64_to_v12f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s9, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s9, 0 +; VI-NEXT: s_add_u32 s10, s4, 16 +; VI-NEXT: s_addc_u32 s11, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 
v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v6f64_to_v12f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v6f64_to_v12f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32 +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 
%cond, 0 br i1 %cmp0, label %if, label %end @@ -630,9 +5484,117 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v12i32_to_v6i64: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v12i32_to_v6i64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) { +; GCN-LABEL: bitcast_v12i32_to_v6i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NEXT: s_mov_b32 s18, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: v_mov_b32_e32 v10, s0 +; GCN-NEXT: v_mov_b32_e32 v11, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v12i32_to_v6i64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s10, s4, 16 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s11, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v12i32_to_v6i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v12i32_to_v6i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: 
v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -649,9 +5611,117 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v12i32_to_v6f64: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v12i32_to_v6f64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) { +; GCN-LABEL: bitcast_v12i32_to_v6f64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NEXT: s_mov_b32 s18, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: v_mov_b32_e32 v10, s0 +; GCN-NEXT: v_mov_b32_e32 v11, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v12i32_to_v6f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s10, s4, 16 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s11, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 
v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v12i32_to_v6f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v12i32_to_v6f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -668,9 +5738,115 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v6i64_to_v12i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v6i64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x i64> %value) { +; GCN-LABEL: bitcast_v6i64_to_v12i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NEXT: s_mov_b32 s18, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: v_mov_b32_e32 v10, s0 +; GCN-NEXT: v_mov_b32_e32 v11, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v6i64_to_v12i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: 
s_load_dword s9, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s9, 0 +; VI-NEXT: s_add_u32 s10, s4, 16 +; VI-NEXT: s_addc_u32 s11, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v6i64_to_v12i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v6i64_to_v12i32: +; GFX11: ; 
%bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32 +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -687,9 +5863,131 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v7i64_to_v14i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v7i64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x i64> %value) { +; GCN-LABEL: bitcast_v7i64_to_v14i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NEXT: s_mov_b32 s18, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v12, s0 +; GCN-NEXT: v_mov_b32_e32 v13, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: v_mov_b32_e32 v10, s0 +; GCN-NEXT: 
v_mov_b32_e32 v11, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 +; GCN-NEXT: buffer_store_dwordx2 v[12:13], off, s[16:19], 0 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v7i64_to_v14i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s12, s4, 16 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s13, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s8, s4, 48 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v7i64_to_v14i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; 
GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:48 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v7i64_to_v14i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v13, s0 +; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v14, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v14, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v14, v[12:13], s[4:5] offset:48 +; GFX11-NEXT: global_store_b128 v14, v[8:11], s[4:5] offset:32 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -706,9 +6004,131 @@ end: ret void } -; 
CHECK-LABEL: {{^}}bitcast_v7f64_to_v14i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v7f64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x double> %value) { +; GCN-LABEL: bitcast_v7f64_to_v14i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NEXT: s_mov_b32 s18, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v7, s0 +; GCN-NEXT: v_mov_b32_e32 v12, s0 +; GCN-NEXT: v_mov_b32_e32 v13, s0 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NEXT: v_mov_b32_e32 v10, s0 +; GCN-NEXT: v_mov_b32_e32 v11, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 +; GCN-NEXT: buffer_store_dwordx2 v[12:13], off, s[16:19], 0 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v7f64_to_v14i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s12, s4, 16 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s13, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; 
VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s8, s4, 48 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v7f64_to_v14i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:48 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v7f64_to_v14i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: 
s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v13, s0 +; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v14, v[0:3], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v14, v[4:7], s[4:5] +; GFX11-NEXT: global_store_b64 v14, v[12:13], s[4:5] offset:48 +; GFX11-NEXT: global_store_b128 v14, v[8:11], s[4:5] offset:32 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -725,9 +6145,156 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v9i64_to_v18i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v9i64_to_v18i32(i32 %cond, ptr addrspace(1) %out, <9 x i64> %value) { +; GCN-LABEL: bitcast_v9i64_to_v18i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s6 +; GCN-NEXT: v_mov_b32_e32 v8, s6 +; GCN-NEXT: v_mov_b32_e32 v9, s6 +; GCN-NEXT: v_mov_b32_e32 v10, s6 +; GCN-NEXT: v_mov_b32_e32 v11, s6 +; GCN-NEXT: v_mov_b32_e32 v12, s6 +; GCN-NEXT: v_mov_b32_e32 v13, s6 +; GCN-NEXT: v_mov_b32_e32 v14, s6 +; GCN-NEXT: v_mov_b32_e32 v15, s6 +; 
GCN-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NEXT: v_mov_b32_e32 v17, s6 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[16:17], off, s[0:3], 0 offset:64 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v9i64_to_v18i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s16, s4, 48 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s17, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: s_add_u32 s12, s4, 32 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s13, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: s_add_u32 s10, s4, 16 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s11, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_add_u32 s0, s4, 64 +; VI-NEXT: s_addc_u32 s1, 
s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v9i64_to_v18i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[20:21] offset:64 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v9i64_to_v18i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v18, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; 
GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0 +; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0 +; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0 +; GFX11-NEXT: v_mov_b32_e32 v16, s0 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: global_store_b128 v18, v[0:3], s[4:5] offset:48 +; GFX11-NEXT: global_store_b128 v18, v[4:7], s[4:5] offset:32 +; GFX11-NEXT: global_store_b128 v18, v[8:11], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v18, v[12:15], s[4:5] +; GFX11-NEXT: global_store_b64 v18, v[16:17], s[4:5] offset:64 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -744,9 +6311,163 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v10i64_to_v20i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v10i64_to_v20i32(i32 %cond, ptr addrspace(1) %out, <10 x i64> %value) { +; GCN-LABEL: bitcast_v10i64_to_v20i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s6 +; GCN-NEXT: v_mov_b32_e32 v8, s6 +; GCN-NEXT: v_mov_b32_e32 v9, s6 +; GCN-NEXT: v_mov_b32_e32 v10, s6 +; GCN-NEXT: v_mov_b32_e32 v11, s6 +; GCN-NEXT: v_mov_b32_e32 v12, s6 +; GCN-NEXT: v_mov_b32_e32 v13, s6 +; GCN-NEXT: v_mov_b32_e32 v14, s6 +; GCN-NEXT: v_mov_b32_e32 v15, s6 +; GCN-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NEXT: v_mov_b32_e32 v17, 
s6 +; GCN-NEXT: v_mov_b32_e32 v18, s6 +; GCN-NEXT: v_mov_b32_e32 v19, s6 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v10i64_to_v20i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s18, s4, 48 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s19, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: s_add_u32 s14, s4, 32 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s15, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s14 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s15 +; VI-NEXT: s_add_u32 s14, s4, 16 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s15, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s14 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s15 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: 
v_mov_b32_e32 v3, s0 +; VI-NEXT: s_add_u32 s0, s4, 64 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v10i64_to_v20i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:64 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v10i64_to_v20i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v20, 0 :: 
v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0 +; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0 +; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0 +; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: v_mov_b32_e32 v18, s0 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] offset:48 +; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:32 +; GFX11-NEXT: global_store_b128 v20, v[8:11], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v20, v[12:15], s[4:5] +; GFX11-NEXT: global_store_b128 v20, v[16:19], s[4:5] offset:64 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -763,9 +6484,183 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v11i64_to_v20i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v11i64_to_v20i32(i32 %cond, ptr addrspace(1) %out, <11 x i64> %value) { +; GCN-LABEL: bitcast_v11i64_to_v20i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s6 +; GCN-NEXT: v_mov_b32_e32 v8, s6 +; GCN-NEXT: v_mov_b32_e32 v9, s6 +; GCN-NEXT: v_mov_b32_e32 v10, s6 +; GCN-NEXT: v_mov_b32_e32 
v11, s6 +; GCN-NEXT: v_mov_b32_e32 v12, s6 +; GCN-NEXT: v_mov_b32_e32 v13, s6 +; GCN-NEXT: v_mov_b32_e32 v14, s6 +; GCN-NEXT: v_mov_b32_e32 v15, s6 +; GCN-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NEXT: v_mov_b32_e32 v17, s6 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[16:17], off, s[0:3], 0 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v11i64_to_v20i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s20, s4, 48 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s21, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_add_u32 s16, s4, 32 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s17, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: s_add_u32 s10, s4, 16 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s11, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_mov_b32 s6, s0 +; VI-NEXT: s_mov_b32 s7, s0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_add_u32 s0, s4, 0x50 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s4, 64 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v11i64_to_v20i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[24:25] offset:80 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:64 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v11i64_to_v20i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v22, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0 +; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0 +; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v21, s0 +; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v17, s0 +; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: v_mov_b32_e32 v18, s0 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: global_store_b128 v22, v[0:3], s[4:5] offset:48 +; GFX11-NEXT: global_store_b128 v22, v[4:7], s[4:5] offset:32 +; GFX11-NEXT: global_store_b128 v22, v[8:11], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v22, v[12:15], s[4:5] +; GFX11-NEXT: global_store_b64 v22, v[20:21], s[4:5] offset:80 +; GFX11-NEXT: global_store_b128 v22, v[16:19], s[4:5] offset:64 +; GFX11-NEXT: s_endpgm 
entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -782,9 +6677,185 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v12i64_to_v22i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v12i64_to_v22i32(i32 %cond, ptr addrspace(1) %out, <12 x i64> %value) { +; GCN-LABEL: bitcast_v12i64_to_v22i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s6 +; GCN-NEXT: v_mov_b32_e32 v8, s6 +; GCN-NEXT: v_mov_b32_e32 v9, s6 +; GCN-NEXT: v_mov_b32_e32 v10, s6 +; GCN-NEXT: v_mov_b32_e32 v11, s6 +; GCN-NEXT: v_mov_b32_e32 v12, s6 +; GCN-NEXT: v_mov_b32_e32 v13, s6 +; GCN-NEXT: v_mov_b32_e32 v14, s6 +; GCN-NEXT: v_mov_b32_e32 v15, s6 +; GCN-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NEXT: v_mov_b32_e32 v17, s6 +; GCN-NEXT: v_mov_b32_e32 v18, s6 +; GCN-NEXT: v_mov_b32_e32 v19, s6 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v12i64_to_v22i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; 
VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s22, s4, 0x50 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s23, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s22 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s23 +; VI-NEXT: s_add_u32 s18, s4, 64 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s19, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: s_add_u32 s14, s4, 48 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s15, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s14 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s15 +; VI-NEXT: s_add_u32 s10, s4, 32 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s11, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s11 +; VI-NEXT: s_add_u32 s6, s4, 16 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s7, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v12i64_to_v22i32: +; GFX9: ; %bb.0: 
; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[26:27], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:80 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:64 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v12i64_to_v22i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 
s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0 +; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0 +; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0 +; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v21, s0 +; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v23, s0 +; GFX11-NEXT: v_mov_b32_e32 v22, s0 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: global_store_b128 v24, v[0:3], s[4:5] offset:80 +; GFX11-NEXT: global_store_b128 v24, v[4:7], s[4:5] offset:64 +; GFX11-NEXT: global_store_b128 v24, v[8:11], s[4:5] offset:48 +; GFX11-NEXT: global_store_b128 v24, v[12:15], s[4:5] offset:32 +; GFX11-NEXT: global_store_b128 v24, v[16:19], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v24, v[20:23], s[4:5] +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -801,9 +6872,204 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v13i64_to_v24i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v13i64_to_v24i32(i32 %cond, ptr addrspace(1) %out, <13 x i64> %value) { +; GCN-LABEL: bitcast_v13i64_to_v24i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 
v7, s6 +; GCN-NEXT: v_mov_b32_e32 v8, s6 +; GCN-NEXT: v_mov_b32_e32 v9, s6 +; GCN-NEXT: v_mov_b32_e32 v10, s6 +; GCN-NEXT: v_mov_b32_e32 v11, s6 +; GCN-NEXT: v_mov_b32_e32 v12, s6 +; GCN-NEXT: v_mov_b32_e32 v13, s6 +; GCN-NEXT: v_mov_b32_e32 v14, s6 +; GCN-NEXT: v_mov_b32_e32 v15, s6 +; GCN-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NEXT: v_mov_b32_e32 v17, s6 +; GCN-NEXT: v_mov_b32_e32 v18, s6 +; GCN-NEXT: v_mov_b32_e32 v19, s6 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:96 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v13i64_to_v24i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_add_u32 s24, s4, 0x50 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_addc_u32 s25, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s24 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s25 +; VI-NEXT: s_add_u32 s20, s4, 64 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s21, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: 
v_mov_b32_e32 v5, s21 +; VI-NEXT: s_add_u32 s16, s4, 48 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s17, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: s_add_u32 s12, s4, 32 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s13, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: s_add_u32 s6, s4, 16 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s7, s5, 0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_add_u32 s0, s4, 0x60 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v13i64_to_v24i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] 
offset:80 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:64 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[28:29] offset:96 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v13i64_to_v24i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v20, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 
+; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0 +; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0 +; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0 +; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: v_mov_b32_e32 v18, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] offset:80 +; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:64 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: v_mov_b32_e32 v5, s0 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: global_store_b128 v20, v[8:11], s[4:5] offset:48 +; GFX11-NEXT: global_store_b128 v20, v[12:15], s[4:5] offset:32 +; GFX11-NEXT: global_store_b128 v20, v[16:19], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b64 v20, v[4:5], s[4:5] offset:96 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -820,9 +7086,211 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v14i64_to_v26i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v14i64_to_v26i32(i32 %cond, ptr addrspace(1) %out, <14 x i64> %value) { +; GCN-LABEL: bitcast_v14i64_to_v26i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s6 +; GCN-NEXT: v_mov_b32_e32 v8, s6 +; GCN-NEXT: v_mov_b32_e32 v9, s6 +; GCN-NEXT: v_mov_b32_e32 v10, s6 +; GCN-NEXT: v_mov_b32_e32 v11, s6 +; GCN-NEXT: 
v_mov_b32_e32 v12, s6 +; GCN-NEXT: v_mov_b32_e32 v13, s6 +; GCN-NEXT: v_mov_b32_e32 v14, s6 +; GCN-NEXT: v_mov_b32_e32 v15, s6 +; GCN-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NEXT: v_mov_b32_e32 v17, s6 +; GCN-NEXT: v_mov_b32_e32 v18, s6 +; GCN-NEXT: v_mov_b32_e32 v19, s6 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s6 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v14i64_to_v26i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s4, s0, 0x50 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s4, s0, 64 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s4, s0, 48 +; VI-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s4, s0, 32 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s4, s0, 16 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s0, 0x60 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v14i64_to_v26i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:80 +; GFX9-NEXT: s_nop 
0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:64 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:96 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v14i64_to_v26i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v20, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: 
v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0 +; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0 +; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0 +; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: v_mov_b32_e32 v18, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] offset:80 +; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:64 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s0 +; GFX11-NEXT: v_mov_b32_e32 v7, s0 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: global_store_b128 v20, v[8:11], s[4:5] offset:48 +; GFX11-NEXT: global_store_b128 v20, v[12:15], s[4:5] offset:32 +; GFX11-NEXT: global_store_b128 v20, v[16:19], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:96 +; GFX11-NEXT: s_endpgm entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -839,9 +7307,231 @@ end: ret void } -; CHECK-LABEL: {{^}}bitcast_v15i64_to_v26i32: -; CHECK: ScratchSize: 0 + + define amdgpu_kernel void @bitcast_v15i64_to_v26i32(i32 %cond, ptr addrspace(1) %out, <15 x i64> %value) { +; GCN-LABEL: bitcast_v15i64_to_v26i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s6 +; GCN-NEXT: v_mov_b32_e32 v8, s6 +; 
GCN-NEXT: v_mov_b32_e32 v9, s6 +; GCN-NEXT: v_mov_b32_e32 v10, s6 +; GCN-NEXT: v_mov_b32_e32 v11, s6 +; GCN-NEXT: v_mov_b32_e32 v12, s6 +; GCN-NEXT: v_mov_b32_e32 v13, s6 +; GCN-NEXT: v_mov_b32_e32 v14, s6 +; GCN-NEXT: v_mov_b32_e32 v15, s6 +; GCN-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NEXT: v_mov_b32_e32 v17, s6 +; GCN-NEXT: v_mov_b32_e32 v18, s6 +; GCN-NEXT: v_mov_b32_e32 v19, s6 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GCN-NEXT: v_mov_b32_e32 v20, s6 +; GCN-NEXT: v_mov_b32_e32 v21, s6 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s6 +; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[20:21], off, s[0:3], 0 offset:112 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_v15i64_to_v26i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s2 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s4, s0, 0x50 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: 
s_add_u32 s4, s0, 64 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s4, s0, 48 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s4, s0, 32 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s4, s0, 16 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_mov_b32 s15, s2 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_add_u32 s2, s0, 0x70 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s0, s0, 0x60 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: 
v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_v15i64_to_v26i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[34:35] offset:112 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_v15i64_to_v26i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v22, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0 +; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0 +; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0 +; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: v_mov_b32_e32 v18, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v22, v[0:3], s[4:5] offset:80 +; GFX11-NEXT: global_store_b128 v22, v[4:7], s[4:5] offset:64 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v20, s0 +; GFX11-NEXT: v_dual_mov_b32 v21, s0 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s0 +; GFX11-NEXT: v_mov_b32_e32 v7, s0 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: global_store_b128 v22, v[8:11], s[4:5] offset:48 +; GFX11-NEXT: global_store_b128 v22, v[12:15], s[4:5] offset:32 +; GFX11-NEXT: global_store_b128 v22, v[16:19], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v22, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b64 v22, v[20:21], s[4:5] offset:112 +; GFX11-NEXT: global_store_b128 v22, v[4:7], s[4:5] offset:96 +; GFX11-NEXT: s_endpgm entry: %cmp0 = 
icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -858,8 +7548,70 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v2bf16_to_i32: + define void @v_bitcast_v2bf16_to_i32(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v2bf16_to_i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB59_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GCN-NEXT: .LBB59_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v2bf16_to_i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v0, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v2bf16_to_i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX11-LABEL: v_bitcast_v2bf16_to_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -874,8 +7626,70 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v2bf16_to_v2i16: + define void @v_bitcast_v2bf16_to_v2i16(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v2bf16_to_v2i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB60_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GCN-NEXT: .LBB60_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v2bf16_to_v2i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v0, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX9-LABEL: v_bitcast_v2bf16_to_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v2bf16_to_v2i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -890,8 +7704,77 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v2bf16_to_v2f16: + define void @v_bitcast_v2bf16_to_v2f16(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v2bf16_to_v2f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB61_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GCN-NEXT: .LBB61_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, 
s6 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v3 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v2bf16_to_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v0, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v2bf16_to_v2f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v2bf16_to_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -906,8 +7789,70 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v2bf16_to_v4i8: + define void @v_bitcast_v2bf16_to_v4i8(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) { +; 
GCN-LABEL: v_bitcast_v2bf16_to_v4i8: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB62_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GCN-NEXT: .LBB62_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v2bf16_to_v4i8: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v0, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v2bf16_to_v4i8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v2bf16_to_v4i8: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: 
s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -922,8 +7867,91 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v3bf16_to_v3i16: + define void @v_bitcast_v3bf16_to_v3i16(i32 %cond, ptr addrspace(1) %out, <3 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v3bf16_to_v3i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: v_mov_b32_e32 v7, s5 +; GCN-NEXT: v_mov_b32_e32 v6, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB63_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v6, v0, v3, 16 +; GCN-NEXT: .LBB63_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: buffer_store_short v7, v[1:2], s[4:7], 0 addr64 offset:4 +; GCN-NEXT: buffer_store_dword v6, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v3bf16_to_v3i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: 
v_add_u32_e32 v3, vcc, 4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_short v[3:4], v6 +; VI-NEXT: flat_store_dword v[1:2], v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v3bf16_to_v3i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_short v[1:2], v6, off offset:4 +; GFX9-NEXT: global_store_dword v[1:2], v5, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v3bf16_to_v3i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v[1:2], v6, off offset:4 +; GFX11-NEXT: global_store_b32 v[1:2], v5, off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -938,8 +7966,101 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v3bf16_to_v3f16: + define void @v_bitcast_v3bf16_to_v3f16(i32 %cond, ptr addrspace(1) %out, <3 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v3bf16_to_v3f16: +; GCN: ; %bb.0: ; %entry +; 
GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB64_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GCN-NEXT: .LBB64_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v3 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v3bf16_to_v3f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_short v[3:4], v6 +; VI-NEXT: flat_store_dword v[1:2], v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 
s[30:31] +; +; GFX9-LABEL: v_bitcast_v3bf16_to_v3f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_short v[1:2], v6, off offset:4 +; GFX9-NEXT: global_store_dword v[1:2], v5, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v3bf16_to_v3f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v[1:2], v6, off offset:4 +; GFX11-NEXT: global_store_b32 v[1:2], v5, off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -954,8 +8075,73 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_i32_to_v2bf16: + define void @v_bitcast_i32_to_v2bf16(i32 %cond, ptr addrspace(1) %out, i32 %value) { +; GCN-LABEL: v_bitcast_i32_to_v2bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
GCN-NEXT: s_cbranch_execz .LBB65_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GCN-NEXT: .LBB65_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_i32_to_v2bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v0, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_i32_to_v2bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_i32_to_v2bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: 
global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -970,8 +8156,73 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v2i16_to_v2bf16: + define void @v_bitcast_v2i16_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <2 x i16> %value) { +; GCN-LABEL: v_bitcast_v2i16_to_v2bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB66_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GCN-NEXT: .LBB66_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v2i16_to_v2bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v0, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v2i16_to_v2bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: 
v_mov_b32_e32 v0, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v2i16_to_v2bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -986,8 +8237,75 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v2f16_to_v2bf16: + define void @v_bitcast_v2f16_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <2 x half> %value) { +; GCN-LABEL: v_bitcast_v2f16_to_v2bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB67_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GCN-NEXT: .LBB67_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v2f16_to_v2bf16: +; VI: ; %bb.0: ; 
%entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v0, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v2f16_to_v2bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v2f16_to_v2bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1002,8 +8320,99 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4i8_to_v2bf16: + define void @v_bitcast_v4i8_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <4 x i8> %value) { +; GCN-LABEL: v_bitcast_v4i8_to_v2bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB68_2 +; GCN-NEXT: ; %bb.1: ; 
%if +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v0, v3, v0 +; GCN-NEXT: v_or_b32_e32 v7, v5, v4 +; GCN-NEXT: .LBB68_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4i8_to_v2bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6 +; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dword v[1:2], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4i8_to_v2bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6 +; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4i8_to_v2bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB68_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: .LBB68_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1018,8 +8427,95 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v3i16_to_v3bf16: + define void @v_bitcast_v3i16_to_v3bf16(i32 %cond, ptr addrspace(1) %out, <3 x i16> %value) { +; GCN-LABEL: v_bitcast_v3i16_to_v3bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: 
s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB69_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; GCN-NEXT: .LBB69_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 offset:4 +; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v3i16_to_v3bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_short v[3:4], v6 +; VI-NEXT: flat_store_dword v[1:2], v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v3i16_to_v3bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_short v[1:2], v6, off offset:4 +; GFX9-NEXT: global_store_dword v[1:2], v5, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v3i16_to_v3bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v[1:2], v6, off offset:4 +; GFX11-NEXT: global_store_b32 v[1:2], v5, off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1034,8 +8530,100 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v4f16: + define void @v_bitcast_v4bf16_to_v4f16(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v4bf16_to_v4f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB70_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GCN-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v5 +; GCN-NEXT: .LBB70_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v3, v0, v3 +; GCN-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4bf16_to_v4f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4bf16_to_v4f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; 
GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4bf16_to_v4f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1050,8 +8638,86 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v4i16: + define void @v_bitcast_v4bf16_to_v4i16(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v4bf16_to_v4i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: v_mov_b32_e32 v8, s5 +; GCN-NEXT: v_mov_b32_e32 v7, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB71_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16 +; GCN-NEXT: .LBB71_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: buffer_store_dwordx2 v[7:8], 
v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4bf16_to_v4i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4bf16_to_v4i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4bf16_to_v4i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off +; GFX11-NEXT: 
s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1066,8 +8732,86 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v2i32: + define void @v_bitcast_v4bf16_to_v2i32(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v4bf16_to_v2i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: v_mov_b32_e32 v8, s5 +; GCN-NEXT: v_mov_b32_e32 v7, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB72_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16 +; GCN-NEXT: .LBB72_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4bf16_to_v2i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4bf16_to_v2i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4bf16_to_v2i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1082,8 +8826,86 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v2f32: + define void @v_bitcast_v4bf16_to_v2f32(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v4bf16_to_v2f32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: v_mov_b32_e32 v8, s5 +; GCN-NEXT: v_mov_b32_e32 v7, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB73_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 
v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16 +; GCN-NEXT: .LBB73_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4bf16_to_v2f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4bf16_to_v2f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4bf16_to_v2f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: 
v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1098,8 +8920,80 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_f64: + define void @v_bitcast_v4bf16_to_f64(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v4bf16_to_f64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB74_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16 +; GCN-NEXT: .LBB74_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4bf16_to_f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: v_mov_b32_e32 v6, 0 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: 
; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4bf16_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4bf16_to_f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1114,8 +9008,80 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_i64: + define void @v_bitcast_v4bf16_to_i64(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v4bf16_to_i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB75_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, 
v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16 +; GCN-NEXT: .LBB75_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4bf16_to_i64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: v_mov_b32_e32 v6, 0 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4bf16_to_i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4bf16_to_i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; 
GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1130,8 +9096,86 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v8i8: + define void @v_bitcast_v4bf16_to_v8i8(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v4bf16_to_v8i8: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: v_mov_b32_e32 v8, s5 +; GCN-NEXT: v_mov_b32_e32 v7, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB76_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16 +; GCN-NEXT: .LBB76_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4bf16_to_v8i8: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4bf16_to_v8i8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4bf16_to_v8i8: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1146,8 +9190,92 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_i64_to_v4bf16: + define void @v_bitcast_i64_to_v4bf16(i32 %cond, ptr addrspace(1) %out, i64 %value) { +; GCN-LABEL: v_bitcast_i64_to_v4bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB77_2 +; GCN-NEXT: ; %bb.1: 
; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: .LBB77_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v0, v5, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_i64_to_v4bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_i64_to_v4bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_i64_to_v4bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1162,8 +9290,92 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v2f32_to_v4bf16: + define void @v_bitcast_v2f32_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <2 x float> %value) { +; GCN-LABEL: v_bitcast_v2f32_to_v4bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB78_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: .LBB78_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v0, v5, 16 +; 
GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v2f32_to_v4bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v2f32_to_v4bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v2f32_to_v4bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; 
%bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1178,8 +9390,92 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v2i32_to_v4bf16: + define void @v_bitcast_v2i32_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <2 x i32> %value) { +; GCN-LABEL: v_bitcast_v2i32_to_v4bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB79_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: .LBB79_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v0, v5, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v2i32_to_v4bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; 
VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v2i32_to_v4bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v2i32_to_v4bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1194,8 +9490,92 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4i16_to_v4bf16: + define void @v_bitcast_v4i16_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <4 x i16> %value) { +; GCN-LABEL: v_bitcast_v4i16_to_v4bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 
+; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB80_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; GCN-NEXT: .LBB80_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v3, v3, v0, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4i16_to_v4bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4i16_to_v4bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: 
; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4i16_to_v4bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1210,8 +9590,96 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4f16_to_v4bf16: + define void @v_bitcast_v4f16_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <4 x half> %value) { +; GCN-LABEL: v_bitcast_v4f16_to_v4bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB81_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GCN-NEXT: .LBB81_2: ; %end +; 
GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v3, v3, v0, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4f16_to_v4bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4f16_to_v4bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4f16_to_v4bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1226,8 +9694,110 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v6bf16_to_v6i16: + define void @v_bitcast_v6bf16_to_v6i16(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v6bf16_to_v6i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: v_mov_b32_e32 v11, s6 +; GCN-NEXT: v_mov_b32_e32 v10, s5 +; GCN-NEXT: v_mov_b32_e32 v9, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB82_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v9, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v10, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v11, v6, v7, 16 +; GCN-NEXT: .LBB82_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dword v11, v[1:2], s[8:11], 0 addr64 
offset:8 +; GCN-NEXT: buffer_store_dwordx2 v[9:10], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v6bf16_to_v6i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v9, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v9, v6 +; VI-NEXT: v_mov_b32_e32 v8, v5 +; VI-NEXT: v_mov_b32_e32 v7, v4 +; VI-NEXT: v_mov_b32_e32 v6, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v6bf16_to_v6i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v6bf16_to_v6i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 
s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v9, v6 +; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1242,8 +9812,128 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v6bf16_to_v6f16: + define void @v_bitcast_v6bf16_to_v6f16(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v6bf16_to_v6f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB83_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GCN-NEXT: 
v_cvt_f32_f16_e32 v13, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GCN-NEXT: .LBB83_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v11 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_or_b32_e32 v4, v6, v5 +; GCN-NEXT: v_or_b32_e32 v0, v7, v0 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v6bf16_to_v6f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v9, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v9, v6 +; VI-NEXT: v_mov_b32_e32 v8, v5 +; VI-NEXT: v_mov_b32_e32 v7, v4 +; VI-NEXT: v_mov_b32_e32 v6, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v6bf16_to_v6f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v0 +; GFX9-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v6bf16_to_v6f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v9, v6 +; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1258,8 +9948,104 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v6bf16_to_v12i8: + define void @v_bitcast_v6bf16_to_v12i8(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v6bf16_to_v12i8: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: v_mov_b32_e32 v11, s6 +; GCN-NEXT: v_mov_b32_e32 v10, s5 +; GCN-NEXT: 
v_mov_b32_e32 v9, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB84_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v9, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v10, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v11, v6, v7, 16 +; GCN-NEXT: .LBB84_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dword v11, v[1:2], s[8:11], 0 addr64 offset:8 +; GCN-NEXT: buffer_store_dwordx2 v[9:10], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v6bf16_to_v12i8: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v8, v5 +; VI-NEXT: v_mov_b32_e32 v7, v4 +; VI-NEXT: v_mov_b32_e32 v6, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v6bf16_to_v12i8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v8, s6 +; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v6bf16_to_v12i8: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v7, s1 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1274,8 +10060,122 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v6f16_to_v6bf16: + define void @v_bitcast_v6f16_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <6 x half> %value) { +; GCN-LABEL: v_bitcast_v6f16_to_v6bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB85_2 +; 
GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GCN-NEXT: .LBB85_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v6f16_to_v6bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v9, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v9, v6 +; VI-NEXT: v_mov_b32_e32 v8, v5 +; VI-NEXT: v_mov_b32_e32 v7, v4 +; VI-NEXT: v_mov_b32_e32 v6, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: 
s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v6f16_to_v6bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v6f16_to_v6bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v9, v6 +; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1290,8 +10190,116 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v6i16_to_v6bf16: + 
define void @v_bitcast_v6i16_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <6 x i16> %value) { +; GCN-LABEL: v_bitcast_v6i16_to_v6bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB86_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GCN-NEXT: .LBB86_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v6i16_to_v6bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v9, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v0 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v9, v6 +; VI-NEXT: v_mov_b32_e32 v8, v5 +; VI-NEXT: v_mov_b32_e32 v7, v4 +; VI-NEXT: v_mov_b32_e32 v6, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v6i16_to_v6bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v6i16_to_v6bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 +; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v9, v6 +; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4 +; GFX11-NEXT: 
v_mov_b32_e32 v6, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1306,8 +10314,183 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v12i8_to_v6bf16: + define void @v_bitcast_v12i8_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <12 x i8> %value) { +; GCN-LABEL: v_bitcast_v12i8_to_v6bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB87_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GCN-NEXT: v_or_b32_e32 v16, v3, v0 +; GCN-NEXT: v_or_b32_e32 v18, v5, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GCN-NEXT: v_or_b32_e32 v19, v9, v7 +; GCN-NEXT: v_or_b32_e32 v15, v11, v8 +; GCN-NEXT: v_or_b32_e32 v0, v13, v10 +; GCN-NEXT: .LBB87_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] 
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v17 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v12i8_to_v6bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v18, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v17, s6 +; VI-NEXT: v_mov_b32_e32 v16, s5 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB87_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6 +; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v10 +; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa 
v16, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v14 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: .LBB87_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx3 v[1:2], v[15:17] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v12i8_to_v6bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, s6 +; GFX9-NEXT: v_mov_b32_e32 v16, s5 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB87_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6 +; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: 
v_lshlrev_b16_e32 v0, 8, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: .LBB87_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx3 v[1:2], v[15:17], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v12i8_to_v6bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v18, s3 :: v_dual_mov_b32 v17, s2 +; GFX11-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v15, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB87_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v8 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v10 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v12 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v14 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, 
v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v15, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v16, v3, v5 +; GFX11-NEXT: v_or_b32_e32 v17, v6, v7 +; GFX11-NEXT: .LBB87_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b96 v[1:2], v[15:17], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1322,8 +10505,116 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v2f64: + define void @v_bitcast_v8bf16_to_v2f64(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v8bf16_to_v2f64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: v_mov_b32_e32 v14, s7 +; GCN-NEXT: v_mov_b32_e32 v13, s6 +; GCN-NEXT: v_mov_b32_e32 v12, s5 +; GCN-NEXT: v_mov_b32_e32 v11, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB88_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16 +; 
GCN-NEXT: .LBB88_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8bf16_to_v2f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mov_b32_e32 v9, v5 +; VI-NEXT: v_mov_b32_e32 v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8bf16_to_v2f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
v_bitcast_v8bf16_to_v2f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1338,8 +10629,116 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v2i64: + define void @v_bitcast_v8bf16_to_v2i64(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v8bf16_to_v2i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: v_mov_b32_e32 v14, s7 +; GCN-NEXT: v_mov_b32_e32 v13, s6 +; GCN-NEXT: v_mov_b32_e32 v12, s5 +; GCN-NEXT: v_mov_b32_e32 v11, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB89_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; 
GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16 +; GCN-NEXT: .LBB89_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8bf16_to_v2i64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mov_b32_e32 v9, v5 +; VI-NEXT: v_mov_b32_e32 v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8bf16_to_v2i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: 
v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8bf16_to_v2i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1354,8 +10753,116 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v4f32: + define void @v_bitcast_v8bf16_to_v4f32(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v8bf16_to_v4f32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: v_mov_b32_e32 v14, s7 +; GCN-NEXT: v_mov_b32_e32 v13, s6 +; GCN-NEXT: v_mov_b32_e32 v12, s5 +; GCN-NEXT: v_mov_b32_e32 v11, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB90_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: 
v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16 +; GCN-NEXT: .LBB90_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8bf16_to_v4f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mov_b32_e32 v9, v5 +; VI-NEXT: v_mov_b32_e32 v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8bf16_to_v4f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; 
GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8bf16_to_v4f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1370,8 +10877,116 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v4i32: + define void @v_bitcast_v8bf16_to_v4i32(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v8bf16_to_v4i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 
+; GCN-NEXT: v_mov_b32_e32 v14, s7 +; GCN-NEXT: v_mov_b32_e32 v13, s6 +; GCN-NEXT: v_mov_b32_e32 v12, s5 +; GCN-NEXT: v_mov_b32_e32 v11, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB91_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16 +; GCN-NEXT: .LBB91_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8bf16_to_v4i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mov_b32_e32 v9, v5 +; VI-NEXT: v_mov_b32_e32 v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; VI-NEXT: 
s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8bf16_to_v4i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8bf16_to_v4i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1386,8 +11001,140 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v8f16: + define void @v_bitcast_v8bf16_to_v8f16(i32 %cond, ptr addrspace(1) %out, 
<8 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v8bf16_to_v8f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB92_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 +; GCN-NEXT: .LBB92_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_or_b32_e32 v4, v6, v5 +; GCN-NEXT: v_or_b32_e32 v5, v8, v7 +; GCN-NEXT: v_or_b32_e32 v6, v9, v0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8bf16_to_v8f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mov_b32_e32 v9, v5 +; VI-NEXT: v_mov_b32_e32 v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8bf16_to_v8f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, 
s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8bf16_to_v8f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1402,8 +11149,116 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v8i16: + define void @v_bitcast_v8bf16_to_v8i16(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v8bf16_to_v8i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: v_mov_b32_e32 v14, s7 +; GCN-NEXT: v_mov_b32_e32 v13, s6 +; GCN-NEXT: v_mov_b32_e32 v12, s5 +; GCN-NEXT: v_mov_b32_e32 v11, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB93_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: 
v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16 +; GCN-NEXT: .LBB93_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8bf16_to_v8i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mov_b32_e32 v9, v5 +; VI-NEXT: v_mov_b32_e32 v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8bf16_to_v8i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; 
GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8bf16_to_v8i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1418,8 +11273,132 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v8f16_to_v8bf16: + define void @v_bitcast_v8f16_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <8 x half> %value) { +; GCN-LABEL: v_bitcast_v8f16_to_v8bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; 
GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB94_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GCN-NEXT: .LBB94_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v0, v9, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8f16_to_v8bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; 
VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mov_b32_e32 v9, v5 +; VI-NEXT: v_mov_b32_e32 v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8f16_to_v8bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8f16_to_v8bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: 
v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1434,8 +11413,124 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v8i16_to_v8bf16: + define void @v_bitcast_v8i16_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <8 x i16> %value) { +; GCN-LABEL: v_bitcast_v8i16_to_v8bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB95_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; GCN-NEXT: .LBB95_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 
16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v0, v9, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8i16_to_v8bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mov_b32_e32 v9, v5 +; VI-NEXT: v_mov_b32_e32 v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8i16_to_v8bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8i16_to_v8bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1450,8 +11545,218 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v16i8_to_v8bf16: + define void @v_bitcast_v16i8_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <16 x i8> %value) { +; GCN-LABEL: v_bitcast_v16i8_to_v8bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB96_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; 
GCN-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v14 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v16 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GCN-NEXT: v_or_b32_e32 v12, v14, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; GCN-NEXT: v_or_b32_e32 v19, v3, v0 +; GCN-NEXT: v_or_b32_e32 v23, v5, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v6 +; GCN-NEXT: v_or_b32_e32 v24, v9, v7 +; GCN-NEXT: v_or_b32_e32 v21, v11, v8 +; GCN-NEXT: v_or_b32_e32 v25, v13, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; GCN-NEXT: v_or_b32_e32 v0, v17, v14 +; GCN-NEXT: .LBB96_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v0, v9, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v16i8_to_v8bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB96_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6 +; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v10 +; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v14 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v18 +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: .LBB96_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v16i8_to_v8bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB96_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6 +; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v21, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: .LBB96_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v16i8_to_v8bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB96_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v8 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v12 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v9 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-NEXT: v_lshlrev_b16 v6, 8, v10 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v14 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v15 +; GFX11-NEXT: 
v_lshlrev_b16 v12, 8, v16 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v18 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_or_b32_e32 v19, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v20, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v21, v6, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v22, v8, v9 +; GFX11-NEXT: .LBB96_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1466,8 +11771,124 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v2i64_to_v8bf16: + define void @v_bitcast_v2i64_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <2 x i64> %value) { +; GCN-LABEL: v_bitcast_v2i64_to_v8bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB97_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; 
GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; GCN-NEXT: .LBB97_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v2i64_to_v8bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mov_b32_e32 v9, v5 +; VI-NEXT: v_mov_b32_e32 v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
v_bitcast_v2i64_to_v8bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v2i64_to_v8bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1482,8 +11903,124 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v2f64_to_v8bf16: + define void @v_bitcast_v2f64_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <2 x double> %value) { +; GCN-LABEL: v_bitcast_v2f64_to_v8bf16: +; 
GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB98_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; GCN-NEXT: .LBB98_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v2f64_to_v8bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; 
VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mov_b32_e32 v9, v5 +; VI-NEXT: v_mov_b32_e32 v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v2f64_to_v8bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v2f64_to_v8bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; 
GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1498,8 +12035,124 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4i32_to_v8bf16: + define void @v_bitcast_v4i32_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <4 x i32> %value) { +; GCN-LABEL: v_bitcast_v4i32_to_v8bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB99_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; GCN-NEXT: .LBB99_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 
16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4i32_to_v8bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mov_b32_e32 v9, v5 +; VI-NEXT: v_mov_b32_e32 v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4i32_to_v8bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, 
exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4i32_to_v8bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1514,8 +12167,124 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4f32_to_v8bf16: + define void @v_bitcast_v4f32_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <4 x float> %value) { +; GCN-LABEL: v_bitcast_v4f32_to_v8bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB100_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 
v8, 16, v5 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; GCN-NEXT: .LBB100_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4f32_to_v8bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v10, v6 +; VI-NEXT: v_mov_b32_e32 v9, v5 +; VI-NEXT: v_mov_b32_e32 v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4f32_to_v8bf16: +; GFX9: 
; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4f32_to_v8bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1530,8 +12299,180 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v16i16: + define void @v_bitcast_v16bf16_to_v16i16(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v16bf16_to_v16i16: +; GCN: ; %bb.0: ; %entry +; 
GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: v_mov_b32_e32 v26, s11 +; GCN-NEXT: v_mov_b32_e32 v25, s10 +; GCN-NEXT: v_mov_b32_e32 v24, s9 +; GCN-NEXT: v_mov_b32_e32 v23, s8 +; GCN-NEXT: v_mov_b32_e32 v22, s7 +; GCN-NEXT: v_mov_b32_e32 v21, s6 +; GCN-NEXT: v_mov_b32_e32 v20, s5 +; GCN-NEXT: v_mov_b32_e32 v19, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB101_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16 +; GCN-NEXT: 
v_alignbit_b32 v25, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16 +; GCN-NEXT: .LBB101_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v16bf16_to_v16i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v18, v10 +; VI-NEXT: v_mov_b32_e32 v17, v9 +; VI-NEXT: v_mov_b32_e32 v16, v8 +; VI-NEXT: v_mov_b32_e32 v15, v7 +; VI-NEXT: v_mov_b32_e32 v14, v6 +; VI-NEXT: v_mov_b32_e32 v13, v5 +; VI-NEXT: v_mov_b32_e32 v12, v4 +; VI-NEXT: v_mov_b32_e32 v11, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v16bf16_to_v16i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 
+; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v16bf16_to_v16i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v17, s6 +; GFX11-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v15, s4 +; GFX11-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v13, s2 +; GFX11-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; 
GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 +; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 +; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1546,8 +12487,228 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v16f16: + define void @v_bitcast_v16bf16_to_v16f16(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v16bf16_to_v16f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB102_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v10, 
1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v17 +; GCN-NEXT: .LBB102_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v31 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v32 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v29 
+; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_or_b32_e32 v4, v6, v5 +; GCN-NEXT: v_or_b32_e32 v5, v8, v7 +; GCN-NEXT: v_or_b32_e32 v6, v10, v9 +; GCN-NEXT: v_or_b32_e32 v7, v12, v11 +; GCN-NEXT: v_or_b32_e32 v8, v14, v13 +; GCN-NEXT: v_or_b32_e32 v9, v16, v15 +; GCN-NEXT: v_or_b32_e32 v10, v17, v0 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v16bf16_to_v16f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v18, v10 +; VI-NEXT: v_mov_b32_e32 v17, v9 +; VI-NEXT: v_mov_b32_e32 v16, v8 +; VI-NEXT: v_mov_b32_e32 v15, v7 +; VI-NEXT: v_mov_b32_e32 v14, v6 +; VI-NEXT: v_mov_b32_e32 v13, v5 +; VI-NEXT: v_mov_b32_e32 v12, v4 +; VI-NEXT: v_mov_b32_e32 v11, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v16bf16_to_v16f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v16bf16_to_v16f16: 
+; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v17, s6 +; GFX11-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v15, s4 +; GFX11-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v13, s2 +; GFX11-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 +; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 +; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1562,8 +12723,180 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v8i32: + define void @v_bitcast_v16bf16_to_v8i32(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v16bf16_to_v8i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: 
v_mov_b32_e32 v26, s11 +; GCN-NEXT: v_mov_b32_e32 v25, s10 +; GCN-NEXT: v_mov_b32_e32 v24, s9 +; GCN-NEXT: v_mov_b32_e32 v23, s8 +; GCN-NEXT: v_mov_b32_e32 v22, s7 +; GCN-NEXT: v_mov_b32_e32 v21, s6 +; GCN-NEXT: v_mov_b32_e32 v20, s5 +; GCN-NEXT: v_mov_b32_e32 v19, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB103_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16 +; GCN-NEXT: .LBB103_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[8:11], 0 
addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v16bf16_to_v8i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v18, v10 +; VI-NEXT: v_mov_b32_e32 v17, v9 +; VI-NEXT: v_mov_b32_e32 v16, v8 +; VI-NEXT: v_mov_b32_e32 v15, v7 +; VI-NEXT: v_mov_b32_e32 v14, v6 +; VI-NEXT: v_mov_b32_e32 v13, v5 +; VI-NEXT: v_mov_b32_e32 v12, v4 +; VI-NEXT: v_mov_b32_e32 v11, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v16bf16_to_v8i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: 
v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v16bf16_to_v8i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v17, s6 +; GFX11-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v15, s4 +; GFX11-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v13, s2 +; GFX11-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 +; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 +; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1578,8 +12911,180 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v8f32: + define void @v_bitcast_v16bf16_to_v8f32(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v16bf16_to_v8f32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: v_mov_b32_e32 v26, s11 +; GCN-NEXT: v_mov_b32_e32 v25, s10 +; GCN-NEXT: v_mov_b32_e32 v24, s9 +; GCN-NEXT: v_mov_b32_e32 v23, s8 +; GCN-NEXT: v_mov_b32_e32 v22, s7 +; GCN-NEXT: v_mov_b32_e32 v21, s6 +; GCN-NEXT: v_mov_b32_e32 v20, s5 +; GCN-NEXT: v_mov_b32_e32 v19, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB104_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: 
v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16 +; GCN-NEXT: .LBB104_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v16bf16_to_v8f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v18, v10 +; VI-NEXT: v_mov_b32_e32 v17, v9 +; VI-NEXT: v_mov_b32_e32 v16, v8 +; VI-NEXT: v_mov_b32_e32 v15, v7 +; VI-NEXT: v_mov_b32_e32 v14, v6 +; VI-NEXT: v_mov_b32_e32 v13, v5 +; VI-NEXT: 
v_mov_b32_e32 v12, v4 +; VI-NEXT: v_mov_b32_e32 v11, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v16bf16_to_v8f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v16bf16_to_v8f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: 
s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v17, s6 +; GFX11-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v15, s4 +; GFX11-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v13, s2 +; GFX11-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 +; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 +; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1594,8 +13099,180 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v4f64: + define void @v_bitcast_v16bf16_to_v4f64(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v16bf16_to_v4f64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: v_mov_b32_e32 v26, s11 +; GCN-NEXT: v_mov_b32_e32 v25, s10 +; GCN-NEXT: v_mov_b32_e32 v24, s9 +; GCN-NEXT: v_mov_b32_e32 v23, s8 +; GCN-NEXT: v_mov_b32_e32 v22, s7 +; GCN-NEXT: v_mov_b32_e32 v21, s6 +; GCN-NEXT: v_mov_b32_e32 v20, s5 +; GCN-NEXT: 
v_mov_b32_e32 v19, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB105_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16 +; GCN-NEXT: .LBB105_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v16bf16_to_v4f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v18, v10 +; VI-NEXT: v_mov_b32_e32 v17, v9 +; VI-NEXT: v_mov_b32_e32 v16, v8 +; VI-NEXT: v_mov_b32_e32 v15, v7 +; VI-NEXT: v_mov_b32_e32 v14, v6 +; VI-NEXT: v_mov_b32_e32 v13, v5 +; VI-NEXT: v_mov_b32_e32 v12, v4 +; VI-NEXT: v_mov_b32_e32 v11, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v16bf16_to_v4f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 
+; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v16bf16_to_v4f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v17, s6 +; GFX11-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v15, s4 +; GFX11-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v13, s2 +; GFX11-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 +; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 +; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label 
%if, label %end @@ -1610,8 +13287,180 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v4i64: + define void @v_bitcast_v16bf16_to_v4i64(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v16bf16_to_v4i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: v_mov_b32_e32 v26, s11 +; GCN-NEXT: v_mov_b32_e32 v25, s10 +; GCN-NEXT: v_mov_b32_e32 v24, s9 +; GCN-NEXT: v_mov_b32_e32 v23, s8 +; GCN-NEXT: v_mov_b32_e32 v22, s7 +; GCN-NEXT: v_mov_b32_e32 v21, s6 +; GCN-NEXT: v_mov_b32_e32 v20, s5 +; GCN-NEXT: v_mov_b32_e32 v19, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB106_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: 
v_alignbit_b32 v19, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16 +; GCN-NEXT: .LBB106_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v16bf16_to_v4i64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v18, v10 +; VI-NEXT: v_mov_b32_e32 v17, v9 +; VI-NEXT: v_mov_b32_e32 v16, v8 +; VI-NEXT: v_mov_b32_e32 v15, v7 +; VI-NEXT: v_mov_b32_e32 v14, v6 +; VI-NEXT: v_mov_b32_e32 v13, v5 +; VI-NEXT: v_mov_b32_e32 v12, v4 +; VI-NEXT: v_mov_b32_e32 v11, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] +; 
VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v16bf16_to_v4i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v16bf16_to_v4i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v17, s6 +; 
GFX11-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v15, s4 +; GFX11-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v13, s2 +; GFX11-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 +; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 +; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1626,8 +13475,180 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v32i8: + define void @v_bitcast_v16bf16_to_v32i8(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v16bf16_to_v32i8: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: v_mov_b32_e32 v26, s11 +; GCN-NEXT: v_mov_b32_e32 v25, s10 +; GCN-NEXT: v_mov_b32_e32 v24, s9 +; GCN-NEXT: v_mov_b32_e32 v23, s8 +; GCN-NEXT: v_mov_b32_e32 v22, s7 +; GCN-NEXT: v_mov_b32_e32 v21, s6 +; GCN-NEXT: v_mov_b32_e32 v20, s5 +; GCN-NEXT: v_mov_b32_e32 v19, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB107_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: 
v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16 +; GCN-NEXT: .LBB107_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v16bf16_to_v32i8: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, 
s4 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v18, v10 +; VI-NEXT: v_mov_b32_e32 v17, v9 +; VI-NEXT: v_mov_b32_e32 v16, v8 +; VI-NEXT: v_mov_b32_e32 v15, v7 +; VI-NEXT: v_mov_b32_e32 v14, v6 +; VI-NEXT: v_mov_b32_e32 v13, v5 +; VI-NEXT: v_mov_b32_e32 v12, v4 +; VI-NEXT: v_mov_b32_e32 v11, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v16bf16_to_v32i8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NEXT: 
v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v16bf16_to_v32i8: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v17, s6 +; GFX11-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v15, s4 +; GFX11-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v13, s2 +; GFX11-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 +; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 +; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1642,8 +13663,196 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v8f32_to_v16bf16: + define void @v_bitcast_v8f32_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <8 x float> %value) { +; GCN-LABEL: v_bitcast_v8f32_to_v16bf16: +; GCN: ; 
%bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB108_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; GCN-NEXT: .LBB108_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, 
s6 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16 +; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16 +; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8f32_to_v16bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: 
v_mov_b32_e32 v18, v10 +; VI-NEXT: v_mov_b32_e32 v17, v9 +; VI-NEXT: v_mov_b32_e32 v16, v8 +; VI-NEXT: v_mov_b32_e32 v15, v7 +; VI-NEXT: v_mov_b32_e32 v14, v6 +; VI-NEXT: v_mov_b32_e32 v13, v5 +; VI-NEXT: v_mov_b32_e32 v12, v4 +; VI-NEXT: v_mov_b32_e32 v11, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8f32_to_v16bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8f32_to_v16bf16: +; GFX11: ; %bb.0: ; %entry +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v17, s6 +; GFX11-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v15, s4 +; GFX11-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v13, s2 +; GFX11-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 +; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 +; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1658,8 +13867,196 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v8i32_to_v16bf16: + define void @v_bitcast_v8i32_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <8 x i32> %value) { +; GCN-LABEL: v_bitcast_v8i32_to_v16bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 
v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB109_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; GCN-NEXT: .LBB109_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 
v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16 +; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16 +; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8i32_to_v16bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v18, v10 +; VI-NEXT: v_mov_b32_e32 v17, v9 +; VI-NEXT: v_mov_b32_e32 v16, v8 +; VI-NEXT: v_mov_b32_e32 v15, v7 +; VI-NEXT: v_mov_b32_e32 v14, v6 +; VI-NEXT: v_mov_b32_e32 v13, v5 +; VI-NEXT: v_mov_b32_e32 v12, v4 +; VI-NEXT: v_mov_b32_e32 v11, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 
0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8i32_to_v16bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8i32_to_v16bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v17, s6 +; GFX11-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v15, s4 +; GFX11-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v13, s2 +; GFX11-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 +; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 +; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1674,8 +14071,196 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4i64_to_v16bf16: + define void @v_bitcast_v4i64_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <4 x i64> %value) { +; GCN-LABEL: v_bitcast_v4i64_to_v16bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB110_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; GCN-NEXT: .LBB110_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; 
GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16 +; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16 +; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4i64_to_v16bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v18, v10 +; VI-NEXT: v_mov_b32_e32 v17, v9 +; VI-NEXT: v_mov_b32_e32 v16, v8 +; VI-NEXT: v_mov_b32_e32 v15, v7 +; VI-NEXT: v_mov_b32_e32 v14, v6 +; VI-NEXT: v_mov_b32_e32 v13, v5 +; VI-NEXT: v_mov_b32_e32 v12, v4 +; VI-NEXT: v_mov_b32_e32 v11, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4i64_to_v16bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; 
GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4i64_to_v16bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v17, s6 +; GFX11-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v15, s4 +; GFX11-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v13, s2 +; GFX11-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: 
v_dual_mov_b32 v17, v9 +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 +; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 +; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1690,8 +14275,196 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v4f64_to_v16bf16: + define void @v_bitcast_v4f64_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <4 x double> %value) { +; GCN-LABEL: v_bitcast_v4f64_to_v16bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB111_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 +; 
GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; GCN-NEXT: .LBB111_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16 +; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16 +; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v4f64_to_v16bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v14, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v18, v10 +; VI-NEXT: v_mov_b32_e32 v17, v9 +; VI-NEXT: v_mov_b32_e32 v16, v8 +; VI-NEXT: v_mov_b32_e32 v15, v7 +; VI-NEXT: v_mov_b32_e32 v14, v6 +; VI-NEXT: v_mov_b32_e32 v13, v5 +; VI-NEXT: v_mov_b32_e32 v12, v4 +; VI-NEXT: v_mov_b32_e32 v11, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v4f64_to_v16bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 
v13, s6 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v16, v8 +; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v4f64_to_v16bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v17, s6 +; GFX11-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v15, s4 +; GFX11-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v13, s2 +; GFX11-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 +; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 +; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 +; GFX11-NEXT: 
global_store_b128 v[1:2], v[11:14], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1706,8 +14479,394 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v32i8_to_v16bf16: + define void @v_bitcast_v32i8_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <32 x i8> %value) { +; GCN-LABEL: v_bitcast_v32i8_to_v16bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB112_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GCN-NEXT: 
v_and_b32_e32 v10, 0xff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v14 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v16 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v20 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v26 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v28 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v37 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GCN-NEXT: v_or_b32_e32 v12, v14, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GCN-NEXT: v_or_b32_e32 v18, v22, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v28 +; GCN-NEXT: v_or_b32_e32 v24, v30, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32 +; GCN-NEXT: v_or_b32_e32 v50, v3, v0 +; GCN-NEXT: v_or_b32_e32 v54, v5, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v6 +; GCN-NEXT: v_or_b32_e32 v55, v9, v7 +; GCN-NEXT: v_or_b32_e32 v52, v11, v8 +; GCN-NEXT: v_or_b32_e32 v40, v13, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 
16, v12 +; GCN-NEXT: v_or_b32_e32 v41, v17, v14 +; GCN-NEXT: v_or_b32_e32 v33, v19, v15 +; GCN-NEXT: v_or_b32_e32 v39, v21, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; GCN-NEXT: v_or_b32_e32 v48, v25, v20 +; GCN-NEXT: v_or_b32_e32 v35, v27, v22 +; GCN-NEXT: v_or_b32_e32 v49, v29, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v24 +; GCN-NEXT: v_or_b32_e32 v0, v31, v26 +; GCN-NEXT: .LBB112_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v40 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v53 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v48 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v34 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16 +; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16 +; GCN-NEXT: v_alignbit_b32 v10, v0, v17, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], 
v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32i8_to_v16bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: v_mov_b32_e32 v38, s11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v37, s10 +; VI-NEXT: v_mov_b32_e32 v36, s9 +; VI-NEXT: v_mov_b32_e32 v35, s8 +; VI-NEXT: v_mov_b32_e32 v34, s7 +; VI-NEXT: v_mov_b32_e32 v33, s6 +; VI-NEXT: v_mov_b32_e32 v32, s5 +; VI-NEXT: v_mov_b32_e32 v31, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB112_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6 +; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v10 +; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v32, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v14 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v18 +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v22 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v26 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v30 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v37, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: .LBB112_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[35:38] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[31:34] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32i8_to_v16bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v38, s11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v37, s10 +; GFX9-NEXT: v_mov_b32_e32 v36, s9 +; GFX9-NEXT: v_mov_b32_e32 v35, s8 +; GFX9-NEXT: v_mov_b32_e32 v34, s7 +; GFX9-NEXT: v_mov_b32_e32 v33, s6 +; GFX9-NEXT: v_mov_b32_e32 v32, s5 +; GFX9-NEXT: v_mov_b32_e32 v31, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB112_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: 
v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6 +; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v31, v3, v0, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v32, v3, v0, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v33, v3, v0, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v34, v3, v0, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v35, v3, v0, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_perm_b32 v36, v3, v0, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v30 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v37, v3, v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v38, v3, v0, s6 +; GFX9-NEXT: .LBB112_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32i8_to_v16bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v50, off, s32 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v38, s7 :: v_dual_mov_b32 v37, s6 +; GFX11-NEXT: v_dual_mov_b32 v36, s5 :: v_dual_mov_b32 v35, s4 +; GFX11-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v33, s2 +; GFX11-NEXT: v_dual_mov_b32 v32, s1 :: 
v_dual_mov_b32 v31, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB112_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v8 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v10 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v12 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v14 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 +; GFX11-NEXT: v_perm_b32 v31, v4, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v15 +; GFX11-NEXT: v_perm_b32 v32, v5, v3, 0x5040100 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v16 +; GFX11-NEXT: v_perm_b32 v33, v7, v6, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v18 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v19 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v20 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v21 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v22 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v24 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v26 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v28 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v50 +; 
GFX11-NEXT: v_lshlrev_b16 v14, 8, v49 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v48 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v39 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_perm_b32 v34, v3, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v35, v5, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v36, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v37, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v38, v11, v10, 0x5040100 +; GFX11-NEXT: .LBB112_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1722,8 +14881,330 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v8i64: + define void @v_bitcast_v32bf16_to_v8i64(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v32bf16_to_v8i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; GCN-NEXT: 
buffer_load_dword v51, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: s_mov_b32 s12, s4 +; GCN-NEXT: s_mov_b32 s13, s4 +; GCN-NEXT: s_mov_b32 s14, s4 +; GCN-NEXT: s_mov_b32 s15, s4 +; GCN-NEXT: s_mov_b32 s16, s4 +; GCN-NEXT: s_mov_b32 s17, s4 +; GCN-NEXT: s_mov_b32 s18, s4 +; GCN-NEXT: s_mov_b32 s19, s4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v46, s19 +; GCN-NEXT: v_mov_b32_e32 v45, s18 +; GCN-NEXT: v_mov_b32_e32 v44, s17 +; GCN-NEXT: v_mov_b32_e32 v43, s16 +; GCN-NEXT: v_mov_b32_e32 v42, s15 +; GCN-NEXT: v_mov_b32_e32 v41, s14 +; GCN-NEXT: v_mov_b32_e32 v40, s13 +; GCN-NEXT: v_mov_b32_e32 v39, s12 +; GCN-NEXT: v_mov_b32_e32 v38, s11 +; GCN-NEXT: v_mov_b32_e32 v37, s10 +; GCN-NEXT: v_mov_b32_e32 v36, s9 +; GCN-NEXT: v_mov_b32_e32 v35, s8 +; GCN-NEXT: v_mov_b32_e32 v34, s7 +; GCN-NEXT: v_mov_b32_e32 v33, s6 +; GCN-NEXT: v_mov_b32_e32 v32, s5 +; GCN-NEXT: v_mov_b32_e32 v31, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB113_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: 
v_mul_f32_e32 v18, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v51 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16 +; GCN-NEXT: v_alignbit_b32 v39, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v40, v20, v21, 16 +; GCN-NEXT: v_alignbit_b32 v41, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v42, v24, v25, 16 +; GCN-NEXT: v_alignbit_b32 v43, v26, v27, 16 +; 
GCN-NEXT: v_alignbit_b32 v44, v28, v29, 16 +; GCN-NEXT: v_alignbit_b32 v45, v30, v50, 16 +; GCN-NEXT: v_alignbit_b32 v46, v49, v48, 16 +; GCN-NEXT: .LBB113_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[8:11], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[8:11], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32bf16_to_v8i64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; 
VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB113_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB113_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32bf16_to_v8i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 
0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB113_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB113_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], 
off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32bf16_to_v8i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB113_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: 
v_dual_mov_b32 v23, v7 +; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: .LBB113_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1738,8 +15219,330 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v8f64: + define void @v_bitcast_v32bf16_to_v8f64(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v32bf16_to_v8f64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: 
s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: s_mov_b32 s12, s4 +; GCN-NEXT: s_mov_b32 s13, s4 +; GCN-NEXT: s_mov_b32 s14, s4 +; GCN-NEXT: s_mov_b32 s15, s4 +; GCN-NEXT: s_mov_b32 s16, s4 +; GCN-NEXT: s_mov_b32 s17, s4 +; GCN-NEXT: s_mov_b32 s18, s4 +; GCN-NEXT: s_mov_b32 s19, s4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v46, s19 +; GCN-NEXT: v_mov_b32_e32 v45, s18 +; GCN-NEXT: v_mov_b32_e32 v44, s17 +; GCN-NEXT: v_mov_b32_e32 v43, s16 +; GCN-NEXT: v_mov_b32_e32 v42, s15 +; GCN-NEXT: v_mov_b32_e32 v41, s14 +; GCN-NEXT: v_mov_b32_e32 v40, s13 +; GCN-NEXT: v_mov_b32_e32 v39, s12 +; GCN-NEXT: v_mov_b32_e32 v38, s11 +; GCN-NEXT: v_mov_b32_e32 v37, s10 +; GCN-NEXT: v_mov_b32_e32 v36, s9 +; GCN-NEXT: v_mov_b32_e32 v35, s8 +; GCN-NEXT: v_mov_b32_e32 v34, s7 +; GCN-NEXT: v_mov_b32_e32 v33, s6 +; GCN-NEXT: v_mov_b32_e32 v32, s5 +; GCN-NEXT: v_mov_b32_e32 v31, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB114_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: 
v_mul_f32_e32 v26, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v51 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16 +; GCN-NEXT: v_alignbit_b32 v39, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v40, v20, v21, 16 +; GCN-NEXT: v_alignbit_b32 v41, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v42, v24, v25, 16 +; GCN-NEXT: v_alignbit_b32 v43, v26, v27, 16 +; GCN-NEXT: v_alignbit_b32 v44, v28, v29, 16 +; GCN-NEXT: v_alignbit_b32 v45, v30, v50, 16 +; GCN-NEXT: v_alignbit_b32 v46, v49, v48, 16 +; GCN-NEXT: .LBB114_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 
s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[8:11], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[8:11], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32bf16_to_v8f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: 
v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB114_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB114_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32bf16_to_v8f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; 
GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB114_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB114_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32bf16_to_v8f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB114_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: .LBB114_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 +; 
GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1754,8 +15557,330 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v16i32: + define void @v_bitcast_v32bf16_to_v16i32(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v32bf16_to_v16i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: s_mov_b32 s12, s4 +; GCN-NEXT: s_mov_b32 s13, s4 +; GCN-NEXT: s_mov_b32 s14, s4 +; GCN-NEXT: s_mov_b32 s15, s4 +; GCN-NEXT: s_mov_b32 s16, s4 +; GCN-NEXT: s_mov_b32 s17, s4 +; GCN-NEXT: s_mov_b32 s18, s4 +; GCN-NEXT: s_mov_b32 s19, s4 +; GCN-NEXT: s_waitcnt 
expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v46, s19 +; GCN-NEXT: v_mov_b32_e32 v45, s18 +; GCN-NEXT: v_mov_b32_e32 v44, s17 +; GCN-NEXT: v_mov_b32_e32 v43, s16 +; GCN-NEXT: v_mov_b32_e32 v42, s15 +; GCN-NEXT: v_mov_b32_e32 v41, s14 +; GCN-NEXT: v_mov_b32_e32 v40, s13 +; GCN-NEXT: v_mov_b32_e32 v39, s12 +; GCN-NEXT: v_mov_b32_e32 v38, s11 +; GCN-NEXT: v_mov_b32_e32 v37, s10 +; GCN-NEXT: v_mov_b32_e32 v36, s9 +; GCN-NEXT: v_mov_b32_e32 v35, s8 +; GCN-NEXT: v_mov_b32_e32 v34, s7 +; GCN-NEXT: v_mov_b32_e32 v33, s6 +; GCN-NEXT: v_mov_b32_e32 v32, s5 +; GCN-NEXT: v_mov_b32_e32 v31, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB115_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v51 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 
v31, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16 +; GCN-NEXT: v_alignbit_b32 v39, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v40, v20, v21, 16 +; GCN-NEXT: v_alignbit_b32 v41, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v42, v24, v25, 16 +; GCN-NEXT: v_alignbit_b32 v43, v26, v27, 16 +; GCN-NEXT: v_alignbit_b32 v44, v28, v29, 16 +; GCN-NEXT: v_alignbit_b32 v45, v30, v50, 16 +; GCN-NEXT: v_alignbit_b32 v46, v49, v48, 16 +; GCN-NEXT: .LBB115_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[8:11], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[8:11], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[8:11], 0 addr64 +; 
GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32bf16_to_v16i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: 
s_cbranch_execz .LBB115_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB115_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32bf16_to_v16i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: 
v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB115_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB115_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32bf16_to_v16i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; 
GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB115_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: .LBB115_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1770,8 +15895,330 @@ end: ret void } -; 
CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v16f32: + define void @v_bitcast_v32bf16_to_v16f32(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v32bf16_to_v16f32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: s_mov_b32 s12, s4 +; GCN-NEXT: s_mov_b32 s13, s4 +; GCN-NEXT: s_mov_b32 s14, s4 +; GCN-NEXT: s_mov_b32 s15, s4 +; GCN-NEXT: s_mov_b32 s16, s4 +; GCN-NEXT: s_mov_b32 s17, s4 +; GCN-NEXT: s_mov_b32 s18, s4 +; GCN-NEXT: s_mov_b32 s19, s4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v46, s19 +; GCN-NEXT: v_mov_b32_e32 v45, s18 +; GCN-NEXT: v_mov_b32_e32 v44, s17 +; GCN-NEXT: v_mov_b32_e32 v43, s16 +; GCN-NEXT: v_mov_b32_e32 v42, s15 +; GCN-NEXT: v_mov_b32_e32 v41, s14 +; GCN-NEXT: v_mov_b32_e32 v40, s13 +; GCN-NEXT: v_mov_b32_e32 v39, s12 +; GCN-NEXT: v_mov_b32_e32 v38, s11 
+; GCN-NEXT: v_mov_b32_e32 v37, s10 +; GCN-NEXT: v_mov_b32_e32 v36, s9 +; GCN-NEXT: v_mov_b32_e32 v35, s8 +; GCN-NEXT: v_mov_b32_e32 v34, s7 +; GCN-NEXT: v_mov_b32_e32 v33, s6 +; GCN-NEXT: v_mov_b32_e32 v32, s5 +; GCN-NEXT: v_mov_b32_e32 v31, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB116_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v51 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: 
v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16 +; GCN-NEXT: v_alignbit_b32 v39, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v40, v20, v21, 16 +; GCN-NEXT: v_alignbit_b32 v41, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v42, v24, v25, 16 +; GCN-NEXT: v_alignbit_b32 v43, v26, v27, 16 +; GCN-NEXT: v_alignbit_b32 v44, v28, v29, 16 +; GCN-NEXT: v_alignbit_b32 v45, v30, v50, 16 +; GCN-NEXT: v_alignbit_b32 v46, v49, v48, 16 +; GCN-NEXT: .LBB116_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[8:11], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[8:11], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, 
off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32bf16_to_v16f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB116_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; 
VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB116_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32bf16_to_v16f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: 
v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB116_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB116_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32bf16_to_v16f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: 
v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB116_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: .LBB116_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1786,8 +16233,448 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v32f16: + define void @v_bitcast_v32bf16_to_v32f16(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v32bf16_to_v32f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], 
s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v46, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v60, 0 +; GCN-NEXT: v_mov_b32_e32 v56, 0 +; GCN-NEXT: 
v_mov_b32_e32 v61, 0 +; GCN-NEXT: v_mov_b32_e32 v57, 0 +; GCN-NEXT: v_mov_b32_e32 v62, 0 +; GCN-NEXT: v_mov_b32_e32 v58, 0 +; GCN-NEXT: v_mov_b32_e32 v63, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v45, 0 +; GCN-NEXT: v_mov_b32_e32 v38, 0 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: v_mov_b32_e32 v37, 0 +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB117_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 +; GCN-NEXT: 
v_mul_f32_e32 v25, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v47 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v59 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: 
v_cvt_f32_f16_e32 v46, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v56, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v61, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v57, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v62, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v58, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v63, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v42, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v43, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v40, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v44, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v41, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v29 +; GCN-NEXT: .LBB117_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v60 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v46 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v61 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v56 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v62 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v57 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v63 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v58 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v42 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v54 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v43 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v55 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v44 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 +; GCN-NEXT: 
v_cvt_f16_f32_e32 v17, v45 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v41 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v38 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v51 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v39 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v52 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v48 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v53 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v49 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v35 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v31 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v36 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v32 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v37 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_or_b32_e32 v4, v6, v5 +; GCN-NEXT: v_or_b32_e32 v5, v8, v7 +; GCN-NEXT: v_or_b32_e32 v6, v10, v9 +; GCN-NEXT: v_or_b32_e32 v7, v12, v11 +; GCN-NEXT: v_or_b32_e32 v8, v14, v13 +; GCN-NEXT: v_or_b32_e32 v9, v16, v15 +; GCN-NEXT: v_or_b32_e32 v10, v18, v17 +; GCN-NEXT: v_or_b32_e32 v11, v20, v19 +; GCN-NEXT: v_or_b32_e32 v12, v22, v21 +; GCN-NEXT: v_or_b32_e32 v13, v24, v23 +; GCN-NEXT: v_or_b32_e32 v14, v26, v25 +; GCN-NEXT: v_or_b32_e32 v15, v28, v27 +; GCN-NEXT: v_or_b32_e32 v16, 
v30, v29 +; GCN-NEXT: v_or_b32_e32 v17, v32, v31 +; GCN-NEXT: v_or_b32_e32 v18, v33, v0 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32bf16_to_v32f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; 
VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB117_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB117_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; 
VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32bf16_to_v32f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB117_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, 
v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB117_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32bf16_to_v32f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; 
GFX11-NEXT: s_cbranch_execz .LBB117_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: .LBB117_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1802,8 +16689,330 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v32i16: + define void @v_bitcast_v32bf16_to_v32i16(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v32bf16_to_v32i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword 
v49, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: s_mov_b32 s12, s4 +; GCN-NEXT: s_mov_b32 s13, s4 +; GCN-NEXT: s_mov_b32 s14, s4 +; GCN-NEXT: s_mov_b32 s15, s4 +; GCN-NEXT: s_mov_b32 s16, s4 +; GCN-NEXT: s_mov_b32 s17, s4 +; GCN-NEXT: s_mov_b32 s18, s4 +; GCN-NEXT: s_mov_b32 s19, s4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v46, s19 +; GCN-NEXT: v_mov_b32_e32 v45, s18 +; GCN-NEXT: v_mov_b32_e32 v44, s17 +; GCN-NEXT: v_mov_b32_e32 v43, s16 +; GCN-NEXT: v_mov_b32_e32 v42, s15 +; GCN-NEXT: v_mov_b32_e32 v41, s14 +; GCN-NEXT: v_mov_b32_e32 v40, s13 +; GCN-NEXT: v_mov_b32_e32 v39, s12 +; GCN-NEXT: v_mov_b32_e32 v38, s11 +; GCN-NEXT: v_mov_b32_e32 v37, s10 +; GCN-NEXT: v_mov_b32_e32 v36, s9 +; GCN-NEXT: v_mov_b32_e32 v35, s8 +; GCN-NEXT: v_mov_b32_e32 v34, s7 +; GCN-NEXT: v_mov_b32_e32 v33, s6 +; GCN-NEXT: v_mov_b32_e32 v32, s5 +; GCN-NEXT: v_mov_b32_e32 v31, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB118_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 
v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v51 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16 +; GCN-NEXT: v_alignbit_b32 v39, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v40, v20, v21, 16 +; GCN-NEXT: v_alignbit_b32 v41, 
v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v42, v24, v25, 16 +; GCN-NEXT: v_alignbit_b32 v43, v26, v27, 16 +; GCN-NEXT: v_alignbit_b32 v44, v28, v29, 16 +; GCN-NEXT: v_alignbit_b32 v45, v30, v50, 16 +; GCN-NEXT: v_alignbit_b32 v46, v49, v48, 16 +; GCN-NEXT: .LBB118_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[8:11], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[8:11], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32bf16_to_v32i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 
s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB118_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB118_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32bf16_to_v32i16: +; 
GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB118_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB118_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: 
global_store_dwordx4 v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32bf16_to_v32i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB118_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; 
GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: .LBB118_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1818,8 +17027,330 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v64i8: + define void @v_bitcast_v32bf16_to_v64i8(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v32bf16_to_v64i8: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 
+; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: s_mov_b32 s12, s4 +; GCN-NEXT: s_mov_b32 s13, s4 +; GCN-NEXT: s_mov_b32 s14, s4 +; GCN-NEXT: s_mov_b32 s15, s4 +; GCN-NEXT: s_mov_b32 s16, s4 +; GCN-NEXT: s_mov_b32 s17, s4 +; GCN-NEXT: s_mov_b32 s18, s4 +; GCN-NEXT: s_mov_b32 s19, s4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v46, s19 +; GCN-NEXT: v_mov_b32_e32 v45, s18 +; GCN-NEXT: v_mov_b32_e32 v44, s17 +; GCN-NEXT: v_mov_b32_e32 v43, s16 +; GCN-NEXT: v_mov_b32_e32 v42, s15 +; GCN-NEXT: v_mov_b32_e32 v41, s14 +; GCN-NEXT: v_mov_b32_e32 v40, s13 +; GCN-NEXT: v_mov_b32_e32 v39, s12 +; GCN-NEXT: v_mov_b32_e32 v38, s11 +; GCN-NEXT: v_mov_b32_e32 v37, s10 +; GCN-NEXT: v_mov_b32_e32 v36, s9 +; GCN-NEXT: v_mov_b32_e32 v35, s8 +; GCN-NEXT: v_mov_b32_e32 v34, s7 +; GCN-NEXT: v_mov_b32_e32 v33, s6 +; GCN-NEXT: v_mov_b32_e32 v32, s5 +; GCN-NEXT: v_mov_b32_e32 v31, s4 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_cbranch_execz .LBB119_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, 
v23 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v51 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16 +; GCN-NEXT: v_alignbit_b32 v39, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v40, v20, v21, 16 +; GCN-NEXT: v_alignbit_b32 v41, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v42, v24, v25, 16 +; GCN-NEXT: v_alignbit_b32 v43, v26, v27, 16 +; GCN-NEXT: v_alignbit_b32 v44, v28, v29, 16 +; GCN-NEXT: v_alignbit_b32 v45, v30, v50, 16 +; GCN-NEXT: v_alignbit_b32 v46, v49, v48, 16 +; GCN-NEXT: .LBB119_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_mov_b32 
s5, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[8:11], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[8:11], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32bf16_to_v64i8: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 
v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB119_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB119_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32bf16_to_v64i8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: 
s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB119_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB119_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
v_bitcast_v32bf16_to_v64i8: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB119_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: .LBB119_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1834,8 +17365,1190 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v64i8_to_v32bf16: + define void @v_bitcast_v64i8_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <64 x i8> %value) { +; GCN-LABEL: v_bitcast_v64i8_to_v32bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; 
GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 
offset:32 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NEXT: v_mov_b32_e32 v37, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v38, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NEXT: v_mov_b32_e32 v63, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB120_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GCN-NEXT: v_or_b32_e32 v31, v0, v7 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v16 +; GCN-NEXT: v_or_b32_e32 v0, v0, v7 +; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v7, v8 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v44 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v8, v11 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v4 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v6 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v10 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v10, 24, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 24, v43 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v42 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v17 +; GCN-NEXT: v_and_b32_e32 v51, 0xff, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v61 +; GCN-NEXT: v_and_b32_e32 v55, 0xff, v60 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v59 +; GCN-NEXT: 
v_and_b32_e32 v40, 0xff, v58 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 +; GCN-NEXT: v_and_b32_e32 v41, 0xff, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v47 +; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v47, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v56, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v44, 24, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v58, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v43, 24, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v59, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v61, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v42, 24, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v62, 0xff, 
v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v63, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v24, 24, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; GCN-NEXT: v_or_b32_e32 v12, v51, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v41 +; GCN-NEXT: v_or_b32_e32 v45, v46, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v47 +; 
GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v58 +; GCN-NEXT: v_or_b32_e32 v58, v59, v60 +; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v61 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v63 +; GCN-NEXT: v_or_b32_e32 v0, v0, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v34 +; GCN-NEXT: v_or_b32_e32 v49, v49, v3 +; GCN-NEXT: v_or_b32_e32 v52, v52, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_or_b32_e32 v48, v48, v4 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v3, v36 +; GCN-NEXT: v_or_b32_e32 v40, v7, v37 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; GCN-NEXT: v_or_b32_e32 v41, v8, v11 +; GCN-NEXT: v_or_b32_e32 v22, v6, v20 +; GCN-NEXT: v_or_b32_e32 v20, v9, v35 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v3 +; GCN-NEXT: v_or_b32_e32 v53, v10, v29 +; GCN-NEXT: v_or_b32_e32 v21, v21, v30 +; GCN-NEXT: v_or_b32_e32 v19, v19, v32 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v3 +; GCN-NEXT: v_or_b32_e32 v54, v54, v26 +; GCN-NEXT: v_or_b32_e32 v35, v25, v27 +; GCN-NEXT: v_or_b32_e32 v37, v15, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v12 +; GCN-NEXT: v_or_b32_e32 v25, v16, v13 +; GCN-NEXT: v_or_b32_e32 v36, v57, v14 +; GCN-NEXT: v_or_b32_e32 v38, v38, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v45 +; GCN-NEXT: v_or_b32_e32 v39, v39, v46 +; GCN-NEXT: v_or_b32_e32 v63, v44, v47 +; GCN-NEXT: v_or_b32_e32 v29, v43, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v58 +; GCN-NEXT: v_or_b32_e32 v34, v42, v59 +; GCN-NEXT: v_or_b32_e32 v30, v23, v60 +; 
GCN-NEXT: v_or_b32_e32 v28, v24, v61 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v17, v62 +; GCN-NEXT: .LBB120_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v48 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v31 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v40 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v55 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v54 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v38 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v36 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v34 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 
16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16 +; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16 +; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16 +; GCN-NEXT: v_alignbit_b32 v11, v19, v20, 16 +; GCN-NEXT: v_alignbit_b32 v12, v21, v22, 16 +; GCN-NEXT: v_alignbit_b32 v13, v23, v24, 16 +; GCN-NEXT: v_alignbit_b32 v14, v25, v26, 16 +; GCN-NEXT: v_alignbit_b32 v15, v27, v29, 16 +; GCN-NEXT: v_alignbit_b32 v16, v31, v32, 16 +; GCN-NEXT: v_alignbit_b32 v17, v28, v30, 16 +; GCN-NEXT: v_alignbit_b32 v18, v0, v33, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 
offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v64i8_to_v32bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, 
s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v14, off, 
s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:12 +; VI-NEXT: 
buffer_load_ushort v18, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v46, s19 +; VI-NEXT: v_mov_b32_e32 v45, s18 +; VI-NEXT: v_mov_b32_e32 v44, s17 +; VI-NEXT: v_mov_b32_e32 v43, s16 +; VI-NEXT: v_mov_b32_e32 v42, s15 +; VI-NEXT: v_mov_b32_e32 v41, s14 +; VI-NEXT: v_mov_b32_e32 v40, s13 +; VI-NEXT: v_mov_b32_e32 v39, s12 +; VI-NEXT: v_mov_b32_e32 v38, s11 +; VI-NEXT: v_mov_b32_e32 v37, s10 +; VI-NEXT: v_mov_b32_e32 v36, s9 +; VI-NEXT: v_mov_b32_e32 v35, s8 +; VI-NEXT: v_mov_b32_e32 v34, s7 +; VI-NEXT: v_mov_b32_e32 v33, s6 +; VI-NEXT: v_mov_b32_e32 v32, s5 +; VI-NEXT: v_mov_b32_e32 v31, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB120_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v6 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v6, 8, v10 +; VI-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v32, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v12 +; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v3, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v15 +; VI-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v19 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v22, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v23 +; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v25, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v40, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v27 +; VI-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v41, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v29 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v63 +; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v42, v0, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v61 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v59 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v43, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v47 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v44, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v52 +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v45, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v48 +; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v46, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: .LBB120_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] +; 
VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[35:38] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[31:34] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v64i8_to_v32bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], 
s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort 
v29, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v46, s19 +; GFX9-NEXT: v_mov_b32_e32 v45, s18 +; GFX9-NEXT: v_mov_b32_e32 v44, s17 +; GFX9-NEXT: v_mov_b32_e32 v43, s16 +; GFX9-NEXT: v_mov_b32_e32 v42, s15 +; GFX9-NEXT: v_mov_b32_e32 v41, s14 +; GFX9-NEXT: v_mov_b32_e32 v40, s13 +; GFX9-NEXT: v_mov_b32_e32 v39, s12 +; GFX9-NEXT: v_mov_b32_e32 v38, s11 +; GFX9-NEXT: v_mov_b32_e32 v37, s10 
+; GFX9-NEXT: v_mov_b32_e32 v36, s9 +; GFX9-NEXT: v_mov_b32_e32 v35, s8 +; GFX9-NEXT: v_mov_b32_e32 v34, s7 +; GFX9-NEXT: v_mov_b32_e32 v33, s6 +; GFX9-NEXT: v_mov_b32_e32 v32, s5 +; GFX9-NEXT: v_mov_b32_e32 v31, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB120_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v6 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, 8, v10 +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v31, v4, v3, s6 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v32, v6, v5, s6 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v33, v4, v3, s6 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; 
GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v34, v4, v3, s6 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v35, v4, v3, s6 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v36, v4, v3, s6 +; GFX9-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v37, v4, v3, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v38, v4, v3, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v22, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v39, v4, v3, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v25, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v40, v4, v3, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v26 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v4, v30, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v41, v4, v3, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v42, v3, v0, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v61 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v43, v3, v0, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v44, v3, v0, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v45, v3, v0, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v46, v3, v0, s6 +; GFX9-NEXT: .LBB120_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:48 +; GFX9-NEXT: 
global_store_dwordx4 v[1:2], v[39:42], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v64i8_to_v32bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x6 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:168 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:164 +; GFX11-NEXT: ; meta instruction +; 
GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:160 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:156 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:152 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:148 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:144 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:140 +; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:136 +; GFX11-NEXT: scratch_load_u16 v50, off, s32 offset:132 +; GFX11-NEXT: scratch_load_u16 v51, off, s32 offset:128 +; GFX11-NEXT: scratch_load_u16 v52, off, s32 offset:124 +; GFX11-NEXT: scratch_load_u16 v53, off, s32 offset:120 +; GFX11-NEXT: scratch_load_u16 v54, off, s32 offset:116 +; GFX11-NEXT: scratch_load_u16 v55, off, s32 offset:112 +; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:108 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:104 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:100 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:96 +; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:92 +; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:88 +; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:84 +; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:80 +; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:76 +; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:72 +; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:68 +; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:40 +; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:36 +; GFX11-NEXT: 
scratch_load_u16 v99, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:20 +; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:16 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v115, off, s32 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v46, s15 :: v_dual_mov_b32 v45, s14 +; GFX11-NEXT: v_dual_mov_b32 v44, s13 :: v_dual_mov_b32 v43, s12 +; GFX11-NEXT: v_dual_mov_b32 v42, s11 :: v_dual_mov_b32 v41, s10 +; GFX11-NEXT: v_dual_mov_b32 v40, s9 :: v_dual_mov_b32 v39, s8 +; GFX11-NEXT: v_dual_mov_b32 v38, s7 :: v_dual_mov_b32 v37, s6 +; GFX11-NEXT: v_dual_mov_b32 v36, s5 :: v_dual_mov_b32 v35, s4 +; GFX11-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v33, s2 +; GFX11-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v31, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB120_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v9 +; 
GFX11-NEXT: v_lshlrev_b16 v9, 8, v14 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v24 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v8 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v10 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v13 +; GFX11-NEXT: v_perm_b32 v31, v3, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v12 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v16 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v18 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v19 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v20 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v21 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v22 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v23 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v26 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_perm_b32 v32, v5, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v33, v3, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v34, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v35, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v36, v11, v10, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v28 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v115 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v114 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v113 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v112 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v103 +; GFX11-NEXT: 
v_lshlrev_b16 v11, 8, v102 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v101 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v100 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v99 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v98 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v97 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v87 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v86 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v85 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v84 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_perm_b32 v37, v3, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v38, v5, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v39, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v40, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v41, v11, v10, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v82 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v81 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v71 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v70 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v69 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v68 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v67 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v66 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v65 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v54 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v53 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v52 +; GFX11-NEXT: v_and_b32_e32 v13, 
0xff, v51 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v50 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v49 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v48 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_perm_b32 v42, v3, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v43, v5, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v44, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v45, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v46, v11, v10, 0x5040100 +; GFX11-NEXT: .LBB120_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off +; GFX11-NEXT: s_clause 0x6 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:168 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1850,8 +18563,387 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v32i16_to_v32bf16: + define void @v_bitcast_v32i16_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <32 x i16> %value) { +; GCN-LABEL: v_bitcast_v32i16_to_v32bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; 
GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v46, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v58, 0 +; GCN-NEXT: v_mov_b32_e32 v47, 0 +; GCN-NEXT: v_mov_b32_e32 v59, 0 +; GCN-NEXT: 
v_mov_b32_e32 v56, 0 +; GCN-NEXT: v_mov_b32_e32 v60, 0 +; GCN-NEXT: v_mov_b32_e32 v57, 0 +; GCN-NEXT: v_mov_b32_e32 v61, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v45, 0 +; GCN-NEXT: v_mov_b32_e32 v38, 0 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: v_mov_b32_e32 v37, 0 +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB121_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v24 +; GCN-NEXT: 
v_lshlrev_b32_e32 v49, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v28 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v63 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v62 +; GCN-NEXT: .LBB121_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v58 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v46 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v59 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v60 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v56 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v61 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v57 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v42 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v54 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v43 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v44 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v40 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v45 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v38 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v48 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v31 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v36 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_mul_f32_e32 v30, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v6, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v7, v10, v11, 16 +; GCN-NEXT: v_alignbit_b32 v8, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v9, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v10, v16, v17, 16 +; GCN-NEXT: v_alignbit_b32 v11, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v12, v20, v21, 16 +; GCN-NEXT: v_alignbit_b32 v13, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v14, v24, v25, 16 +; GCN-NEXT: v_alignbit_b32 v15, v26, v27, 16 +; GCN-NEXT: v_alignbit_b32 v16, v28, v30, 16 +; GCN-NEXT: v_alignbit_b32 v17, v31, v32, 16 +; GCN-NEXT: v_alignbit_b32 v18, v29, v33, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload 
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32i16_to_v32bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; 
VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB121_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB121_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32i16_to_v32bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 
+; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB121_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB121_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 +; GFX9-NEXT: 
global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32i16_to_v32bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB121_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: 
v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: .LBB121_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1866,8 +18958,416 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v32f16_to_v32bf16: + define void @v_bitcast_v32f16_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <32 x half> %value) { +; GCN-LABEL: v_bitcast_v32f16_to_v32bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded 
Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v46, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v58, 0 +; GCN-NEXT: v_mov_b32_e32 v47, 0 +; GCN-NEXT: v_mov_b32_e32 v59, 0 +; GCN-NEXT: v_mov_b32_e32 v56, 0 +; GCN-NEXT: v_mov_b32_e32 v60, 0 +; GCN-NEXT: v_mov_b32_e32 v57, 0 +; GCN-NEXT: v_mov_b32_e32 v61, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v45, 0 +; GCN-NEXT: v_mov_b32_e32 v38, 0 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: v_mov_b32_e32 v37, 0 +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB122_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: 
v_cvt_f16_f32_e32 v0, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v31 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v62 +; GCN-NEXT: v_cvt_f16_f32_e32 v62, v63 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 
16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; GCN-NEXT: .LBB122_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v46 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v59 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v47 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v60 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v56 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v61 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v57 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v42 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v54 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v43 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v44 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v40 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v45 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v38 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v48 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v53 +; 
GCN-NEXT: v_mul_f32_e32 v26, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v36 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16 +; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16 +; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16 +; GCN-NEXT: v_alignbit_b32 v11, v19, v20, 16 +; GCN-NEXT: v_alignbit_b32 v12, v21, v22, 16 +; GCN-NEXT: v_alignbit_b32 v13, v23, v24, 16 +; GCN-NEXT: v_alignbit_b32 v14, v25, v26, 16 +; GCN-NEXT: v_alignbit_b32 v15, v27, v28, 16 +; GCN-NEXT: v_alignbit_b32 v16, v29, v30, 16 +; GCN-NEXT: v_alignbit_b32 v17, v31, v32, 16 +; GCN-NEXT: v_alignbit_b32 v18, v0, v33, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; 
GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32f16_to_v32bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; 
VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB122_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB122_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; 
VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32f16_to_v32bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB122_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: 
v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB122_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32f16_to_v32bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB122_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, 
v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: .LBB122_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1882,8 +19382,344 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v16i32_to_v32bf16: + define void @v_bitcast_v16i32_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <16 x i32> %value) { +; GCN-LABEL: v_bitcast_v16i32_to_v32bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: v_mov_b32_e32 v38, 0 +; GCN-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v37, 0 +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NEXT: v_mov_b32_e32 
v32, 0 +; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB123_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5 +; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; GCN-NEXT: v_and_b32_e32 v41, 
0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; GCN-NEXT: .LBB123_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, 
v29 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16 +; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16 +; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16 +; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16 +; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16 +; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16 +; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16 +; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16 +; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16 +; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16 +; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v16i32_to_v32bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: 
s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB123_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB123_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v16i32_to_v32bf16: +; GFX9: ; %bb.0: ; %entry 
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB123_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB123_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 
v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v16i32_to_v32bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB123_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; GFX11-NEXT: v_dual_mov_b32 
v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: .LBB123_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1898,8 +19734,344 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v16f32_to_v32bf16: + define void @v_bitcast_v16f32_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <16 x float> %value) { +; GCN-LABEL: v_bitcast_v16f32_to_v32bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: v_mov_b32_e32 v38, 0 +; GCN-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v37, 0 +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NEXT: 
v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB124_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5 +; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; GCN-NEXT: .LBB124_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, 
s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 
v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16 +; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16 +; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16 +; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16 +; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16 +; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16 +; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16 +; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16 +; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16 +; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16 +; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v16f32_to_v32bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; 
VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB124_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB124_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v16f32_to_v32bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 
+; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB124_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB124_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 +; GFX9-NEXT: 
global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v16f32_to_v32bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB124_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: 
v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: .LBB124_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1914,8 +20086,344 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v8f64_to_v32bf16: + define void @v_bitcast_v8f64_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <8 x double> %value) { +; GCN-LABEL: v_bitcast_v8f64_to_v32bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: v_mov_b32_e32 v38, 0 +; GCN-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v37, 0 +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 
+; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB125_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5 +; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; GCN-NEXT: .LBB125_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40 
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 
v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16 +; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16 +; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16 +; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16 +; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16 +; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16 +; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16 +; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16 +; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16 +; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16 +; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8f64_to_v32bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 
v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB125_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB125_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8f64_to_v32bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; 
GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB125_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB125_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8f64_to_v32bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB125_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: 
.LBB125_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1930,8 +20438,344 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v8i64_to_v32bf16: + define void @v_bitcast_v8i64_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <8 x i64> %value) { +; GCN-LABEL: v_bitcast_v8i64_to_v32bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: v_mov_b32_e32 v38, 0 +; GCN-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v37, 0 +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 
v23, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB126_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5 +; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; GCN-NEXT: .LBB126_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52 +; 
GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 +; GCN-NEXT: 
v_alignbit_b32 v6, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16 +; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16 +; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16 +; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16 +; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16 +; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16 +; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16 +; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16 +; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16 +; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16 +; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8i64_to_v32bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v33, s18 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v29, s14 +; VI-NEXT: v_mov_b32_e32 v28, s13 
+; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v26, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s7 +; VI-NEXT: v_mov_b32_e32 v21, s6 +; VI-NEXT: v_mov_b32_e32 v20, s5 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB126_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v34, v18 +; VI-NEXT: v_mov_b32_e32 v33, v17 +; VI-NEXT: v_mov_b32_e32 v32, v16 +; VI-NEXT: v_mov_b32_e32 v31, v15 +; VI-NEXT: v_mov_b32_e32 v30, v14 +; VI-NEXT: v_mov_b32_e32 v29, v13 +; VI-NEXT: v_mov_b32_e32 v28, v12 +; VI-NEXT: v_mov_b32_e32 v27, v11 +; VI-NEXT: v_mov_b32_e32 v26, v10 +; VI-NEXT: v_mov_b32_e32 v25, v9 +; VI-NEXT: v_mov_b32_e32 v24, v8 +; VI-NEXT: v_mov_b32_e32 v23, v7 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v5 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: .LBB126_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8i64_to_v32bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, 
s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s17 +; GFX9-NEXT: v_mov_b32_e32 v31, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v29, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v27, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s11 +; GFX9-NEXT: v_mov_b32_e32 v25, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s7 +; GFX9-NEXT: v_mov_b32_e32 v21, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB126_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v34, v18 +; GFX9-NEXT: v_mov_b32_e32 v33, v17 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: .LBB126_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8i64_to_v32bf16: +; GFX11: ; %bb.0: ; %entry +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v34, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-NEXT: v_dual_mov_b32 v32, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-NEXT: v_dual_mov_b32 v30, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-NEXT: v_dual_mov_b32 v28, s9 :: v_dual_mov_b32 v27, s8 +; GFX11-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v25, s6 +; GFX11-NEXT: v_dual_mov_b32 v24, s5 :: v_dual_mov_b32 v23, s4 +; GFX11-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v21, s2 +; GFX11-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v19, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB126_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 +; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 +; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 +; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 +; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 +; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 +; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 +; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: .LBB126_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[1:2], 
v[31:34], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1946,15 +20790,3279 @@ end: ret void } +define <32 x half> @v_bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { +; GCN-LABEL: v_bitcast_v8i64_to_v32f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v34, v15 +; GCN-NEXT: v_mov_b32_e32 v33, v14 +; GCN-NEXT: v_mov_b32_e32 v36, v13 +; GCN-NEXT: v_mov_b32_e32 v35, v12 +; GCN-NEXT: v_mov_b32_e32 v38, v11 +; GCN-NEXT: v_mov_b32_e32 v37, v10 +; GCN-NEXT: v_mov_b32_e32 v48, v9 +; GCN-NEXT: v_mov_b32_e32 v39, v8 +; GCN-NEXT: v_mov_b32_e32 v50, v7 +; GCN-NEXT: v_mov_b32_e32 v49, v6 +; GCN-NEXT: v_mov_b32_e32 v52, v5 +; GCN-NEXT: v_mov_b32_e32 v51, v4 +; GCN-NEXT: v_mov_b32_e32 v54, v3 +; GCN-NEXT: v_mov_b32_e32 v53, v2 +; GCN-NEXT: v_mov_b32_e32 v55, v1 +; GCN-NEXT: v_mov_b32_e32 v32, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; 
GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB127_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v50 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v49 +; GCN-NEXT: s_waitcnt expcnt(5) +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v52 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v51 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v54 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v53 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 
v46, 16, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: .LBB127_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB127_4 +; 
GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v55, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v54, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v52, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v50, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v48, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v38, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v36, vcc +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v34, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: 
v_cvt_f32_f16_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 +; GCN-NEXT: .LBB127_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8i64_to_v32f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB127_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; 
VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB127_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8i64_to_v32f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB127_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB127_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
v_bitcast_v8i64_to_v32f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB127_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: .LBB127_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <32 x half> + br label %end +cmp.false: + %a3 = bitcast <8 x i64> %a to <32 x half> + br label %end +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} +define <32 x i16> @v_bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { +; GCN-LABEL: v_bitcast_v8i64_to_v32i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v30, v15 +; GCN-NEXT: v_mov_b32_e32 v28, v14 +; GCN-NEXT: 
v_mov_b32_e32 v26, v13 +; GCN-NEXT: v_mov_b32_e32 v24, v12 +; GCN-NEXT: v_mov_b32_e32 v22, v11 +; GCN-NEXT: v_mov_b32_e32 v20, v10 +; GCN-NEXT: v_mov_b32_e32 v18, v9 +; GCN-NEXT: v_mov_b32_e32 v32, v8 +; GCN-NEXT: v_mov_b32_e32 v14, v7 +; GCN-NEXT: v_mov_b32_e32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v10, v5 +; GCN-NEXT: v_mov_b32_e32 v8, v4 +; GCN-NEXT: v_mov_b32_e32 v6, v3 +; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB128_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: .LBB128_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; 
GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NEXT: v_mov_b32_e32 v16, v32 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8i64_to_v32i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB128_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB128_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8i64_to_v32i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB128_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; 
GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB128_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8i64_to_v32i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB128_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: .LBB128_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <32 x i16> + br label %end +cmp.false: + %a3 = bitcast <8 x i64> %a to <32 x i16> + br label %end +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} +define <32 x i16> @v_bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { +; GCN-LABEL: v_bitcast_v8f64_to_v32i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v55, v15 +; GCN-NEXT: v_mov_b32_e32 v54, v14 +; GCN-NEXT: v_mov_b32_e32 v53, v13 +; GCN-NEXT: v_mov_b32_e32 v52, v12 +; GCN-NEXT: v_mov_b32_e32 v51, v11 +; GCN-NEXT: v_mov_b32_e32 v50, v10 +; GCN-NEXT: v_mov_b32_e32 v49, v9 +; GCN-NEXT: v_mov_b32_e32 v48, v8 +; GCN-NEXT: v_mov_b32_e32 v38, v7 +; GCN-NEXT: v_mov_b32_e32 v37, v6 +; GCN-NEXT: v_mov_b32_e32 v36, v5 +; GCN-NEXT: v_mov_b32_e32 v35, v4 +; GCN-NEXT: v_mov_b32_e32 v34, v3 +; GCN-NEXT: v_mov_b32_e32 v33, v2 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB129_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 +; GCN-NEXT: v_add_f64 v[52:53], v[52:53], 1.0 +; GCN-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 +; GCN-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 +; GCN-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 +; GCN-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 +; GCN-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB129_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16 +; GCN-NEXT: 
v_alignbit_b32 v9, v36, v35, 16 +; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16 +; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16 +; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16 +; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16 +; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v33 +; GCN-NEXT: v_mov_b32_e32 v6, v34 +; GCN-NEXT: v_mov_b32_e32 v8, v35 +; GCN-NEXT: v_mov_b32_e32 v10, v36 +; GCN-NEXT: v_mov_b32_e32 v12, v37 +; GCN-NEXT: v_mov_b32_e32 v14, v38 +; GCN-NEXT: v_mov_b32_e32 v16, v48 +; GCN-NEXT: v_mov_b32_e32 v18, v49 +; GCN-NEXT: v_mov_b32_e32 v20, v50 +; GCN-NEXT: v_mov_b32_e32 v22, v51 +; GCN-NEXT: v_mov_b32_e32 v24, v52 +; GCN-NEXT: v_mov_b32_e32 v26, v53 +; GCN-NEXT: v_mov_b32_e32 v28, v54 +; GCN-NEXT: v_mov_b32_e32 v30, v55 +; GCN-NEXT: v_mov_b32_e32 v1, v32 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8f64_to_v32i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB129_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB129_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; 
VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8f64_to_v32i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB129_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB129_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8f64_to_v32i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB129_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB129_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <32 x i16> + br label %end +cmp.false: + %a3 
= bitcast <8 x double> %a to <32 x i16> + br label %end +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + + +define <32 x half> @v_bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { +; GCN-LABEL: v_bitcast_v8f64_to_v32f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB130_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: 
v_lshrrev_b32_e32 v17, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v41 +; GCN-NEXT: 
v_cvt_f32_f16_e32 v55, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: .LBB130_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB130_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5 +; 
GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GCN-NEXT: .LBB130_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v39 +; GCN-NEXT: v_mov_b32_e32 v1, v55 +; GCN-NEXT: v_mov_b32_e32 v2, v32 +; GCN-NEXT: v_mov_b32_e32 v3, v54 +; GCN-NEXT: v_mov_b32_e32 v4, v33 +; GCN-NEXT: v_mov_b32_e32 v5, v53 +; GCN-NEXT: v_mov_b32_e32 v6, v34 +; GCN-NEXT: v_mov_b32_e32 v7, v52 +; GCN-NEXT: v_mov_b32_e32 v8, v35 +; GCN-NEXT: v_mov_b32_e32 v9, v51 +; GCN-NEXT: v_mov_b32_e32 v10, v36 +; GCN-NEXT: v_mov_b32_e32 v11, v50 +; GCN-NEXT: v_mov_b32_e32 v12, v37 +; GCN-NEXT: v_mov_b32_e32 v13, v49 +; GCN-NEXT: v_mov_b32_e32 v14, v38 +; GCN-NEXT: v_mov_b32_e32 v15, v48 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v8f64_to_v32f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB130_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB130_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v8f64_to_v32f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB130_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB130_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v8f64_to_v32f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB130_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: 
v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB130_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <32 x half> + br label %end +cmp.false: + %a3 = bitcast <8 x double> %a to <32 x half> + br label %end +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + + +define <8 x i64> @v_bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { +; GCN-LABEL: v_bitcast_v32f16_to_v8i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, 
v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB131_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; GCN-NEXT: v_or_b32_e32 v0, v44, v0 +; GCN-NEXT: v_or_b32_e32 v1, v42, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; 
GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; GCN-NEXT: v_or_b32_e32 v2, v52, v2 +; GCN-NEXT: v_or_b32_e32 v3, v50, v3 +; GCN-NEXT: v_or_b32_e32 v4, v48, v4 +; GCN-NEXT: v_or_b32_e32 v5, v38, v5 +; GCN-NEXT: v_or_b32_e32 v6, v36, v6 +; GCN-NEXT: v_or_b32_e32 v7, v34, v7 +; GCN-NEXT: v_or_b32_e32 v8, v33, v8 +; GCN-NEXT: v_or_b32_e32 v9, v32, v9 +; GCN-NEXT: v_or_b32_e32 v10, v31, v10 +; GCN-NEXT: v_or_b32_e32 v11, v21, v11 +; GCN-NEXT: v_or_b32_e32 v12, v19, v12 +; GCN-NEXT: v_or_b32_e32 v13, v18, v13 +; GCN-NEXT: v_or_b32_e32 v14, v17, v14 +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: .LBB131_2: ; %Flow +; GCN-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB131_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 
0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; 
GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v9, v8 +; GCN-NEXT: v_or_b32_e32 v6, v11, v10 +; GCN-NEXT: v_or_b32_e32 v7, v13, v12 +; GCN-NEXT: v_or_b32_e32 v8, v15, v14 +; GCN-NEXT: v_or_b32_e32 v9, v26, v24 +; GCN-NEXT: v_or_b32_e32 v10, v28, v27 +; GCN-NEXT: v_or_b32_e32 v11, v21, v29 +; GCN-NEXT: v_or_b32_e32 v12, v19, v25 +; GCN-NEXT: v_or_b32_e32 v13, v18, v23 +; GCN-NEXT: v_or_b32_e32 v14, v17, v22 +; GCN-NEXT: v_or_b32_e32 v15, v16, v20 +; GCN-NEXT: .LBB131_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32f16_to_v8i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB131_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; 
VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB131_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32f16_to_v8i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB131_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB131_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32f16_to_v8i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB131_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 
v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB131_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <8 x i64> + br label %end +cmp.false: + %a3 = bitcast <32 x half> %a to <8 x i64> + br label %end +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + + +define <8 x double> @v_bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { +; GCN-LABEL: v_bitcast_v32f16_to_v8f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB132_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; GCN-NEXT: v_or_b32_e32 v0, v44, v0 +; GCN-NEXT: v_or_b32_e32 v1, v42, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; GCN-NEXT: 
v_lshlrev_b32_e32 v4, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; GCN-NEXT: v_or_b32_e32 v2, v52, v2 +; GCN-NEXT: v_or_b32_e32 v3, v50, v3 +; GCN-NEXT: v_or_b32_e32 v4, v48, v4 +; GCN-NEXT: v_or_b32_e32 v5, v38, v5 +; GCN-NEXT: v_or_b32_e32 v6, v36, v6 +; GCN-NEXT: v_or_b32_e32 v7, v34, v7 +; GCN-NEXT: v_or_b32_e32 v8, v33, v8 +; GCN-NEXT: v_or_b32_e32 v9, v32, v9 +; GCN-NEXT: v_or_b32_e32 v10, v31, v10 +; GCN-NEXT: v_or_b32_e32 v11, v21, v11 +; GCN-NEXT: v_or_b32_e32 v12, v19, v12 +; GCN-NEXT: v_or_b32_e32 v13, v18, v13 +; GCN-NEXT: v_or_b32_e32 v14, v17, v14 +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; 
implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: .LBB132_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB132_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; 
GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v9, v8 +; GCN-NEXT: v_or_b32_e32 v6, v11, v10 +; GCN-NEXT: v_or_b32_e32 v7, v13, v12 +; GCN-NEXT: v_or_b32_e32 v8, v15, v14 +; GCN-NEXT: v_or_b32_e32 v9, v26, v24 +; GCN-NEXT: v_or_b32_e32 v10, v28, v27 +; GCN-NEXT: v_or_b32_e32 v11, v21, v29 +; GCN-NEXT: v_or_b32_e32 v12, v19, v25 +; GCN-NEXT: v_or_b32_e32 v13, v18, v23 +; GCN-NEXT: v_or_b32_e32 v14, v17, v22 +; GCN-NEXT: v_or_b32_e32 v15, v16, v20 +; GCN-NEXT: .LBB132_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; 
GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32f16_to_v8f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB132_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: 
.LBB132_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32f16_to_v8f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB132_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB132_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32f16_to_v8f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB132_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 
op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB132_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <8 x double> + br label %end +cmp.false: + %a3 = bitcast <32 x half> %a to <8 x double> + br label %end +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + + +define <8 x i64> @v_bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { +; GCN-LABEL: v_bitcast_v32i16_to_v8i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v38, v14 +; GCN-NEXT: v_mov_b32_e32 v37, v12 +; GCN-NEXT: v_mov_b32_e32 v36, v10 +; GCN-NEXT: v_mov_b32_e32 v35, v8 +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; 
GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB133_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB133_4 +; GCN-NEXT: .LBB133_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB133_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; GCN-NEXT: v_or_b32_e32 v0, v0, v54 +; GCN-NEXT: v_or_b32_e32 v1, v1, v55 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; GCN-NEXT: v_or_b32_e32 
v2, v2, v39 +; GCN-NEXT: v_or_b32_e32 v3, v3, v48 +; GCN-NEXT: v_or_b32_e32 v4, v4, v49 +; GCN-NEXT: v_or_b32_e32 v5, v5, v50 +; GCN-NEXT: v_or_b32_e32 v6, v6, v51 +; GCN-NEXT: v_or_b32_e32 v7, v7, v52 +; GCN-NEXT: v_or_b32_e32 v8, v8, v17 +; GCN-NEXT: v_or_b32_e32 v9, v9, v19 +; GCN-NEXT: v_or_b32_e32 v10, v10, v21 +; GCN-NEXT: v_or_b32_e32 v11, v11, v23 +; GCN-NEXT: v_or_b32_e32 v12, v12, v25 +; GCN-NEXT: v_or_b32_e32 v13, v13, v27 +; GCN-NEXT: v_or_b32_e32 v14, v14, v29 +; GCN-NEXT: v_or_b32_e32 v15, v15, v53 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB133_2 +; GCN-NEXT: .LBB133_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; 
GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v0, v54, v0 +; GCN-NEXT: v_or_b32_e32 v1, v55, v1 +; GCN-NEXT: v_or_b32_e32 v2, v39, v2 +; GCN-NEXT: v_or_b32_e32 v3, v48, v3 +; GCN-NEXT: v_or_b32_e32 v4, v49, v4 +; GCN-NEXT: v_or_b32_e32 v5, v50, v5 +; GCN-NEXT: v_or_b32_e32 v6, v51, v6 +; GCN-NEXT: v_or_b32_e32 v7, v52, v7 +; GCN-NEXT: v_or_b32_e32 v8, v17, v8 +; GCN-NEXT: v_or_b32_e32 v9, v19, v9 +; GCN-NEXT: v_or_b32_e32 v10, v21, v10 +; GCN-NEXT: v_or_b32_e32 v11, v23, v11 +; GCN-NEXT: v_or_b32_e32 v12, v25, v12 +; GCN-NEXT: v_or_b32_e32 v13, v27, v13 +; GCN-NEXT: v_or_b32_e32 v14, v29, v14 +; GCN-NEXT: v_or_b32_e32 v15, v53, v15 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 
+; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32i16_to_v8i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB133_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v17, 3 +; VI-NEXT: v_add_u16_e32 v16, 3, v15 +; VI-NEXT: v_add_u16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_u16_e32 v16, 3, v14 +; VI-NEXT: v_add_u16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v16, v14 +; VI-NEXT: v_add_u16_e32 v16, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v16, v13 +; VI-NEXT: v_add_u16_e32 v16, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v16, v12 +; VI-NEXT: v_add_u16_e32 v16, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v16, v11 +; VI-NEXT: 
v_add_u16_e32 v16, 3, v10 +; VI-NEXT: v_add_u16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v16, v10 +; VI-NEXT: v_add_u16_e32 v16, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v16, v9 +; VI-NEXT: v_add_u16_e32 v16, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v16, v8 +; VI-NEXT: v_add_u16_e32 v16, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v16, v7 +; VI-NEXT: v_add_u16_e32 v16, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v16, v6 +; VI-NEXT: v_add_u16_e32 v16, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v16, v5 +; VI-NEXT: v_add_u16_e32 v16, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v16, v4 +; VI-NEXT: v_add_u16_e32 v16, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v16, v3 +; VI-NEXT: v_add_u16_e32 v16, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v16, v2 +; VI-NEXT: v_add_u16_e32 v16, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v16, v1 +; VI-NEXT: v_add_u16_e32 v16, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: 
.LBB133_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32i16_to_v8i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB133_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB133_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32i16_to_v8i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB133_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 
op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB133_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <8 x i64> + br label %end +cmp.false: + %a3 = bitcast <32 x i16> %a to <8 x i64> + br label %end +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + + +define <8 x double> @v_bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { +; GCN-LABEL: v_bitcast_v32i16_to_v8f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v38, v14 +; GCN-NEXT: v_mov_b32_e32 v37, v12 +; GCN-NEXT: v_mov_b32_e32 v36, v10 +; GCN-NEXT: v_mov_b32_e32 v35, v8 +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; 
GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB134_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB134_4 +; GCN-NEXT: .LBB134_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB134_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; GCN-NEXT: v_or_b32_e32 v0, v0, v54 +; GCN-NEXT: v_or_b32_e32 v1, v1, v55 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; GCN-NEXT: v_or_b32_e32 v2, v2, v39 +; GCN-NEXT: v_or_b32_e32 v3, v3, v48 +; GCN-NEXT: v_or_b32_e32 v4, v4, v49 +; GCN-NEXT: v_or_b32_e32 v5, v5, v50 +; 
GCN-NEXT: v_or_b32_e32 v6, v6, v51 +; GCN-NEXT: v_or_b32_e32 v7, v7, v52 +; GCN-NEXT: v_or_b32_e32 v8, v8, v17 +; GCN-NEXT: v_or_b32_e32 v9, v9, v19 +; GCN-NEXT: v_or_b32_e32 v10, v10, v21 +; GCN-NEXT: v_or_b32_e32 v11, v11, v23 +; GCN-NEXT: v_or_b32_e32 v12, v12, v25 +; GCN-NEXT: v_or_b32_e32 v13, v13, v27 +; GCN-NEXT: v_or_b32_e32 v14, v14, v29 +; GCN-NEXT: v_or_b32_e32 v15, v15, v53 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB134_2 +; GCN-NEXT: .LBB134_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; 
GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v0, v54, v0 +; GCN-NEXT: v_or_b32_e32 v1, v55, v1 +; GCN-NEXT: v_or_b32_e32 v2, v39, v2 +; GCN-NEXT: v_or_b32_e32 v3, v48, v3 +; GCN-NEXT: v_or_b32_e32 v4, v49, v4 +; GCN-NEXT: v_or_b32_e32 v5, v50, v5 +; GCN-NEXT: v_or_b32_e32 v6, v51, v6 +; GCN-NEXT: v_or_b32_e32 v7, v52, v7 +; GCN-NEXT: v_or_b32_e32 v8, v17, v8 +; GCN-NEXT: v_or_b32_e32 v9, v19, v9 +; GCN-NEXT: v_or_b32_e32 v10, v21, v10 +; GCN-NEXT: v_or_b32_e32 v11, v23, v11 +; GCN-NEXT: v_or_b32_e32 v12, v25, v12 +; GCN-NEXT: v_or_b32_e32 v13, v27, v13 +; GCN-NEXT: v_or_b32_e32 v14, v29, v14 +; GCN-NEXT: v_or_b32_e32 v15, v53, v15 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 
+; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32i16_to_v8f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB134_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v17, 3 +; VI-NEXT: v_add_u16_e32 v16, 3, v15 +; VI-NEXT: v_add_u16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_u16_e32 v16, 3, v14 +; VI-NEXT: v_add_u16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v16, v14 +; VI-NEXT: v_add_u16_e32 v16, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v16, v13 +; VI-NEXT: v_add_u16_e32 v16, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v16, v12 +; VI-NEXT: v_add_u16_e32 v16, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v16, v11 +; VI-NEXT: v_add_u16_e32 v16, 3, v10 +; VI-NEXT: v_add_u16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+; VI-NEXT: v_or_b32_e32 v10, v16, v10 +; VI-NEXT: v_add_u16_e32 v16, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v16, v9 +; VI-NEXT: v_add_u16_e32 v16, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v16, v8 +; VI-NEXT: v_add_u16_e32 v16, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v16, v7 +; VI-NEXT: v_add_u16_e32 v16, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v16, v6 +; VI-NEXT: v_add_u16_e32 v16, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v16, v5 +; VI-NEXT: v_add_u16_e32 v16, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v16, v4 +; VI-NEXT: v_add_u16_e32 v16, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v16, v3 +; VI-NEXT: v_add_u16_e32 v16, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v16, v2 +; VI-NEXT: v_add_u16_e32 v16, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v16, v1 +; VI-NEXT: v_add_u16_e32 v16, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: .LBB134_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32i16_to_v8f64: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB134_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB134_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32i16_to_v8f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB134_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; 
GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB134_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <8 x double> + br label %end +cmp.false: + %a3 = bitcast <32 x i16> %a to <8 x double> + br label %end +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + -; CHECK-LABEL: {{^}}v_bitcast_v32f32_to_v64bf16: define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x float> %value) { +; GCN-LABEL: v_bitcast_v32f32_to_v64bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 
offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v58, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v59, 0 +; GCN-NEXT: v_mov_b32_e32 v56, 0 +; GCN-NEXT: v_mov_b32_e32 v57, 0 +; GCN-NEXT: v_mov_b32_e32 v46, 0 +; GCN-NEXT: v_mov_b32_e32 v47, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NEXT: v_mov_b32_e32 v45, 0 +; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: v_mov_b32_e32 v38, 0 +; GCN-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v37, 0 +; 
GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt 
expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB135_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v62 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 
4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v61 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: 
s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v17 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v15 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v14 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v12 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v8 +; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; GCN-NEXT: 
v_and_b32_e32 v45, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v6 +; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v5 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; GCN-NEXT: .LBB135_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v59 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v58 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v57 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v56 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v46 +; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v45 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v44 +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v42 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54 +; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v52 +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 
offset:16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v48 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v38 +; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v36 +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v34 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v32 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: 
s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte 
Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; 
GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 
offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload 
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32f32_to_v64bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v50, s19 +; VI-NEXT: v_mov_b32_e32 v49, s18 +; VI-NEXT: v_mov_b32_e32 v48, s17 +; VI-NEXT: v_mov_b32_e32 v47, s16 +; VI-NEXT: v_mov_b32_e32 v46, s15 +; VI-NEXT: v_mov_b32_e32 v45, s14 +; VI-NEXT: v_mov_b32_e32 v44, s13 +; VI-NEXT: v_mov_b32_e32 v43, s12 +; VI-NEXT: v_mov_b32_e32 v42, s11 +; VI-NEXT: v_mov_b32_e32 v41, s10 +; VI-NEXT: v_mov_b32_e32 v40, s9 +; VI-NEXT: v_mov_b32_e32 
v39, s8 +; VI-NEXT: v_mov_b32_e32 v38, s7 +; VI-NEXT: v_mov_b32_e32 v37, s6 +; VI-NEXT: v_mov_b32_e32 v36, s5 +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB135_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v50, v18 +; VI-NEXT: v_mov_b32_e32 v49, v17 +; VI-NEXT: v_mov_b32_e32 v48, v16 +; VI-NEXT: v_mov_b32_e32 v47, v15 +; VI-NEXT: v_mov_b32_e32 v46, v14 +; VI-NEXT: v_mov_b32_e32 v45, v13 +; VI-NEXT: v_mov_b32_e32 v44, v12 +; VI-NEXT: v_mov_b32_e32 v43, v11 +; VI-NEXT: v_mov_b32_e32 v42, v10 +; VI-NEXT: v_mov_b32_e32 
v41, v9 +; VI-NEXT: v_mov_b32_e32 v40, v8 +; VI-NEXT: v_mov_b32_e32 v39, v7 +; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v36, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: .LBB135_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; 
VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38] +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: s_movk_i32 s4, 0x70 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x60 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x50 +; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: 
v_add_u32_e32 v0, vcc, 64, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] +; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32f32_to_v64bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: 
s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v50, s19 +; GFX9-NEXT: v_mov_b32_e32 v49, s18 +; GFX9-NEXT: v_mov_b32_e32 v48, s17 +; GFX9-NEXT: v_mov_b32_e32 v47, s16 +; GFX9-NEXT: v_mov_b32_e32 v46, s15 +; GFX9-NEXT: v_mov_b32_e32 v45, s14 +; GFX9-NEXT: v_mov_b32_e32 v44, s13 +; GFX9-NEXT: v_mov_b32_e32 v43, s12 +; GFX9-NEXT: v_mov_b32_e32 v42, s11 +; GFX9-NEXT: v_mov_b32_e32 v41, s10 +; GFX9-NEXT: v_mov_b32_e32 v40, s9 +; GFX9-NEXT: v_mov_b32_e32 v39, s8 +; GFX9-NEXT: v_mov_b32_e32 v38, s7 +; GFX9-NEXT: v_mov_b32_e32 v37, s6 +; GFX9-NEXT: v_mov_b32_e32 v36, s5 +; GFX9-NEXT: v_mov_b32_e32 v35, s4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB135_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v50, v18 +; GFX9-NEXT: v_mov_b32_e32 v49, v17 +; GFX9-NEXT: v_mov_b32_e32 v48, v16 +; GFX9-NEXT: v_mov_b32_e32 v47, v15 +; GFX9-NEXT: v_mov_b32_e32 v46, v14 +; GFX9-NEXT: v_mov_b32_e32 v45, v13 +; GFX9-NEXT: v_mov_b32_e32 v44, v12 +; GFX9-NEXT: v_mov_b32_e32 v43, v11 +; GFX9-NEXT: v_mov_b32_e32 v42, v10 +; GFX9-NEXT: v_mov_b32_e32 v41, v9 +; GFX9-NEXT: v_mov_b32_e32 v40, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v7 +; GFX9-NEXT: v_mov_b32_e32 v38, v6 +; GFX9-NEXT: v_mov_b32_e32 v37, v5 +; GFX9-NEXT: v_mov_b32_e32 v36, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, 
s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB135_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32f32_to_v64bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 +; GFX11-NEXT: ; meta 
instruction +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14 +; 
GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0 +; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12 +; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10 +; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8 +; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6 +; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4 +; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2 +; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56 +; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54 +; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58 +; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60 +; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62 +; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64 +; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB135_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33 +; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15 +; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13 +; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11 +; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9 +; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7 +; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5 +; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3 +; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31 +; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29 +; GFX11-NEXT: v_dual_mov_b32 v44, 
v28 :: v_dual_mov_b32 v43, v27 +; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25 +; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23 +; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21 +; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19 +; GFX11-NEXT: .LBB135_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off +; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112 +; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96 +; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80 +; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1969,8 +24077,958 
@@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v32i32_to_v64bf16: + define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x i32> %value) { +; GCN-LABEL: v_bitcast_v32i32_to_v64bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v60, off, 
s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v58, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v59, 0 +; GCN-NEXT: v_mov_b32_e32 v56, 0 +; GCN-NEXT: v_mov_b32_e32 v57, 0 +; GCN-NEXT: v_mov_b32_e32 v46, 0 +; GCN-NEXT: v_mov_b32_e32 v47, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NEXT: v_mov_b32_e32 v45, 0 +; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: v_mov_b32_e32 v38, 0 +; GCN-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v37, 0 +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte 
Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: 
s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB136_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v62 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v61 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v29 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, 
v23 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v17 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_lshlrev_b32_e32 v0, 16, v17 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v15 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v14 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v12 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v8 +; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v6 +; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v5 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; GCN-NEXT: .LBB136_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v59 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v58 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v57 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v56 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v46 +; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v45 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v44 +; 
GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v42 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54 +; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v52 +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v48 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v38 +; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v36 +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v34 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v33 +; GCN-NEXT: 
v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v32 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 
; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v32i32_to_v64bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword 
v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v50, s19 +; VI-NEXT: v_mov_b32_e32 v49, s18 +; VI-NEXT: v_mov_b32_e32 v48, s17 +; VI-NEXT: v_mov_b32_e32 v47, s16 +; VI-NEXT: v_mov_b32_e32 v46, s15 +; VI-NEXT: v_mov_b32_e32 v45, s14 +; VI-NEXT: v_mov_b32_e32 v44, s13 +; VI-NEXT: v_mov_b32_e32 v43, s12 +; VI-NEXT: v_mov_b32_e32 v42, s11 +; VI-NEXT: v_mov_b32_e32 v41, s10 +; VI-NEXT: v_mov_b32_e32 v40, s9 +; VI-NEXT: v_mov_b32_e32 v39, s8 +; VI-NEXT: v_mov_b32_e32 v38, s7 +; VI-NEXT: v_mov_b32_e32 v37, s6 +; VI-NEXT: v_mov_b32_e32 v36, s5 +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill 
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB136_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v50, v18 +; VI-NEXT: v_mov_b32_e32 v49, v17 +; VI-NEXT: v_mov_b32_e32 v48, v16 +; VI-NEXT: v_mov_b32_e32 v47, v15 +; VI-NEXT: v_mov_b32_e32 v46, v14 +; VI-NEXT: v_mov_b32_e32 v45, v13 +; VI-NEXT: v_mov_b32_e32 v44, v12 +; VI-NEXT: v_mov_b32_e32 v43, v11 +; VI-NEXT: v_mov_b32_e32 v42, v10 +; VI-NEXT: v_mov_b32_e32 v41, v9 +; VI-NEXT: v_mov_b32_e32 v40, v8 +; VI-NEXT: v_mov_b32_e32 v39, v7 +; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v36, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 
offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: .LBB136_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38] +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], 
s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: s_movk_i32 s4, 0x70 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x60 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x50 +; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] +; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v32i32_to_v64bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v50, s19 +; GFX9-NEXT: v_mov_b32_e32 v49, s18 +; GFX9-NEXT: v_mov_b32_e32 v48, s17 +; GFX9-NEXT: v_mov_b32_e32 v47, s16 +; GFX9-NEXT: v_mov_b32_e32 v46, s15 +; GFX9-NEXT: v_mov_b32_e32 v45, s14 +; GFX9-NEXT: v_mov_b32_e32 v44, s13 +; GFX9-NEXT: v_mov_b32_e32 v43, s12 +; GFX9-NEXT: v_mov_b32_e32 v42, s11 +; GFX9-NEXT: v_mov_b32_e32 v41, s10 +; GFX9-NEXT: v_mov_b32_e32 v40, s9 +; GFX9-NEXT: v_mov_b32_e32 v39, s8 +; GFX9-NEXT: v_mov_b32_e32 v38, s7 +; GFX9-NEXT: v_mov_b32_e32 v37, s6 +; GFX9-NEXT: v_mov_b32_e32 
v36, s5 +; GFX9-NEXT: v_mov_b32_e32 v35, s4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB136_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v50, v18 +; GFX9-NEXT: v_mov_b32_e32 v49, v17 +; GFX9-NEXT: v_mov_b32_e32 v48, v16 +; GFX9-NEXT: v_mov_b32_e32 v47, v15 +; GFX9-NEXT: v_mov_b32_e32 v46, v14 +; GFX9-NEXT: v_mov_b32_e32 v45, v13 +; GFX9-NEXT: v_mov_b32_e32 v44, v12 +; GFX9-NEXT: v_mov_b32_e32 v43, v11 +; GFX9-NEXT: v_mov_b32_e32 v42, v10 +; GFX9-NEXT: v_mov_b32_e32 v41, v9 +; GFX9-NEXT: 
v_mov_b32_e32 v40, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v7 +; GFX9-NEXT: v_mov_b32_e32 v38, v6 +; GFX9-NEXT: v_mov_b32_e32 v37, v5 +; GFX9-NEXT: v_mov_b32_e32 v36, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB136_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off +; GFX9-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v32i32_to_v64bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, 
v63, s32 offset:16 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14 +; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0 +; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12 +; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10 +; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8 +; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6 +; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4 +; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2 +; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56 +; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54 +; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58 +; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60 +; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62 +; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64 +; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 
v50, v66 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB136_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33 +; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15 +; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13 +; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11 +; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9 +; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7 +; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5 +; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3 +; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31 +; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29 +; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27 +; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25 +; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23 +; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21 +; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19 +; GFX11-NEXT: .LBB136_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off +; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112 +; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96 +; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80 +; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 +; 
GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -1985,8 +25043,1150 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v64i16_to_v64bf16: + define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x i16> %value) { +; GCN-LABEL: v_bitcast_v64i16_to_v64bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 
4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:140 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt expcnt(5) +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:12 
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v57, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v59, 0 +; GCN-NEXT: v_mov_b32_e32 v56, 0 +; GCN-NEXT: v_mov_b32_e32 v58, 0 +; GCN-NEXT: v_mov_b32_e32 v45, 0 +; GCN-NEXT: v_mov_b32_e32 v47, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NEXT: v_mov_b32_e32 v46, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; 
GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 
0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB137_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; GCN-NEXT: buffer_store_dword v3, off, 
s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, 
s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; GCN-NEXT: buffer_store_dword v3, off, 
s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_lshlrev_b32_e32 v3, 16, v36 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v63 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v62 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: .LBB137_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v59 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v57 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v58 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v56 +; GCN-NEXT: 
v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v47 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v45 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v44 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; 
GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v64i16_to_v64bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 
offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v50, s19 +; VI-NEXT: v_mov_b32_e32 v49, s18 +; VI-NEXT: v_mov_b32_e32 v48, s17 +; VI-NEXT: v_mov_b32_e32 v47, s16 +; VI-NEXT: v_mov_b32_e32 v46, s15 +; VI-NEXT: v_mov_b32_e32 v45, s14 +; VI-NEXT: v_mov_b32_e32 v44, s13 +; VI-NEXT: v_mov_b32_e32 v43, s12 +; VI-NEXT: v_mov_b32_e32 v42, s11 +; VI-NEXT: v_mov_b32_e32 v41, s10 +; VI-NEXT: v_mov_b32_e32 v40, s9 +; VI-NEXT: v_mov_b32_e32 v39, s8 +; VI-NEXT: v_mov_b32_e32 v38, s7 +; VI-NEXT: v_mov_b32_e32 v37, s6 +; VI-NEXT: v_mov_b32_e32 v36, s5 +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB137_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v50, v18 +; VI-NEXT: v_mov_b32_e32 v49, v17 +; VI-NEXT: v_mov_b32_e32 v48, v16 +; VI-NEXT: v_mov_b32_e32 v47, v15 +; VI-NEXT: v_mov_b32_e32 v46, v14 +; VI-NEXT: v_mov_b32_e32 v45, v13 +; VI-NEXT: v_mov_b32_e32 v44, v12 +; VI-NEXT: v_mov_b32_e32 v43, v11 +; VI-NEXT: v_mov_b32_e32 v42, v10 +; VI-NEXT: v_mov_b32_e32 v41, v9 +; VI-NEXT: v_mov_b32_e32 v40, v8 +; VI-NEXT: v_mov_b32_e32 v39, v7 +; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v36, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, 
s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: .LBB137_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38] +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, 
off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: s_movk_i32 s4, 0x70 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x60 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x50 +; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] +; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload 
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v64i16_to_v64bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; 
GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v50, s19 +; GFX9-NEXT: v_mov_b32_e32 v49, s18 +; GFX9-NEXT: v_mov_b32_e32 v48, s17 +; GFX9-NEXT: v_mov_b32_e32 v47, s16 +; GFX9-NEXT: v_mov_b32_e32 v46, s15 +; GFX9-NEXT: v_mov_b32_e32 v45, s14 +; GFX9-NEXT: v_mov_b32_e32 v44, s13 +; GFX9-NEXT: v_mov_b32_e32 v43, s12 +; GFX9-NEXT: v_mov_b32_e32 v42, s11 +; GFX9-NEXT: v_mov_b32_e32 v41, s10 +; GFX9-NEXT: v_mov_b32_e32 v40, s9 +; GFX9-NEXT: v_mov_b32_e32 v39, s8 +; GFX9-NEXT: v_mov_b32_e32 v38, s7 +; GFX9-NEXT: v_mov_b32_e32 v37, s6 +; GFX9-NEXT: v_mov_b32_e32 v36, s5 +; GFX9-NEXT: v_mov_b32_e32 v35, s4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 
offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB137_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v50, v18 +; GFX9-NEXT: v_mov_b32_e32 v49, v17 +; GFX9-NEXT: v_mov_b32_e32 v48, v16 +; GFX9-NEXT: v_mov_b32_e32 v47, v15 +; GFX9-NEXT: v_mov_b32_e32 v46, v14 +; GFX9-NEXT: v_mov_b32_e32 v45, v13 +; GFX9-NEXT: v_mov_b32_e32 v44, v12 +; GFX9-NEXT: v_mov_b32_e32 v43, v11 +; GFX9-NEXT: v_mov_b32_e32 v42, v10 +; GFX9-NEXT: v_mov_b32_e32 v41, v9 +; GFX9-NEXT: v_mov_b32_e32 v40, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v7 +; GFX9-NEXT: v_mov_b32_e32 v38, v6 +; GFX9-NEXT: v_mov_b32_e32 v37, v5 +; GFX9-NEXT: v_mov_b32_e32 v36, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt 
vmcnt(28) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB137_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v64i16_to_v64bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v47, s32 
offset:48 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14 +; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0 +; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12 +; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10 +; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8 +; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6 +; GFX11-NEXT: 
v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4 +; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2 +; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56 +; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54 +; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58 +; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60 +; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62 +; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64 +; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB137_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33 +; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15 +; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13 +; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11 +; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9 +; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7 +; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5 +; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3 +; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31 +; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29 +; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27 +; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25 +; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23 +; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21 +; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19 +; GFX11-NEXT: .LBB137_2: ; %end +; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off +; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112 +; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96 +; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80 +; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -2001,8 +26201,1214 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v64f16_to_v64bf16: + define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x half> %value) { +; GCN-LABEL: v_bitcast_v64f16_to_v64bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, 
off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword 
v23, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v33, v16 +; GCN-NEXT: v_mov_b32_e32 v16, v15 +; GCN-NEXT: v_mov_b32_e32 v15, v14 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill 
+; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 +; GCN-NEXT: 
buffer_load_dword v48, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v57, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v59, 0 +; GCN-NEXT: v_mov_b32_e32 v56, 0 +; GCN-NEXT: v_mov_b32_e32 v58, 0 +; GCN-NEXT: v_mov_b32_e32 v45, 0 +; GCN-NEXT: v_mov_b32_e32 v47, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NEXT: v_mov_b32_e32 v46, 0 +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) 
+; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt 
expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v62, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_mov_b32_e32 v34, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB138_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v59, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v56, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v58, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v47, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v46, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v13 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v15 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v62, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v63, v63 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte 
Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GCN-NEXT: 
v_lshlrev_b32_e32 v43, 16, v43 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v18 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v20 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: buffer_store_dword 
v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v52 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v49 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v23 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v31 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 
s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v40 +; GCN-NEXT: v_mov_b32_e32 v40, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v42 +; GCN-NEXT: v_mov_b32_e32 v42, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; GCN-NEXT: .LBB138_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v59 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v57 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v58 +; GCN-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v56 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v47 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v45 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v44 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; 
GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: 
v_mul_f32_e32 v5, 1.0, v42 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v40 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v62 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v52 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v51 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v34 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v64f16_to_v64bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, 
s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v50, s19 +; VI-NEXT: v_mov_b32_e32 v49, s18 +; VI-NEXT: v_mov_b32_e32 v48, s17 +; VI-NEXT: v_mov_b32_e32 v47, s16 +; VI-NEXT: v_mov_b32_e32 v46, s15 +; VI-NEXT: v_mov_b32_e32 v45, s14 +; VI-NEXT: v_mov_b32_e32 v44, s13 +; VI-NEXT: v_mov_b32_e32 v43, s12 +; VI-NEXT: v_mov_b32_e32 v42, s11 +; VI-NEXT: v_mov_b32_e32 v41, s10 +; VI-NEXT: v_mov_b32_e32 v40, s9 +; VI-NEXT: v_mov_b32_e32 v39, s8 +; VI-NEXT: v_mov_b32_e32 v38, s7 +; VI-NEXT: v_mov_b32_e32 v37, s6 +; VI-NEXT: v_mov_b32_e32 v36, s5 +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB138_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v50, v18 +; VI-NEXT: v_mov_b32_e32 v49, v17 +; VI-NEXT: v_mov_b32_e32 v48, v16 +; VI-NEXT: v_mov_b32_e32 v47, v15 +; VI-NEXT: v_mov_b32_e32 v46, v14 +; VI-NEXT: v_mov_b32_e32 v45, v13 +; VI-NEXT: v_mov_b32_e32 v44, v12 +; VI-NEXT: v_mov_b32_e32 v43, v11 +; VI-NEXT: v_mov_b32_e32 v42, v10 +; VI-NEXT: v_mov_b32_e32 v41, v9 +; VI-NEXT: v_mov_b32_e32 v40, v8 +; VI-NEXT: v_mov_b32_e32 v39, v7 +; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v36, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 
offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: .LBB138_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38] +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: s_movk_i32 s4, 0x70 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x60 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x50 +; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] +; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v64f16_to_v64bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v50, s19 +; GFX9-NEXT: v_mov_b32_e32 v49, s18 +; GFX9-NEXT: v_mov_b32_e32 v48, s17 +; GFX9-NEXT: v_mov_b32_e32 v47, s16 +; GFX9-NEXT: v_mov_b32_e32 v46, s15 +; GFX9-NEXT: v_mov_b32_e32 v45, s14 +; GFX9-NEXT: v_mov_b32_e32 v44, s13 +; GFX9-NEXT: v_mov_b32_e32 v43, s12 +; GFX9-NEXT: v_mov_b32_e32 v42, s11 +; GFX9-NEXT: v_mov_b32_e32 v41, s10 +; GFX9-NEXT: v_mov_b32_e32 v40, s9 +; GFX9-NEXT: v_mov_b32_e32 v39, s8 +; GFX9-NEXT: v_mov_b32_e32 v38, s7 +; GFX9-NEXT: v_mov_b32_e32 v37, s6 +; GFX9-NEXT: v_mov_b32_e32 v36, s5 +; GFX9-NEXT: v_mov_b32_e32 v35, s4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB138_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v50, v18 +; GFX9-NEXT: v_mov_b32_e32 v49, v17 +; GFX9-NEXT: v_mov_b32_e32 v48, v16 +; GFX9-NEXT: v_mov_b32_e32 v47, v15 +; GFX9-NEXT: v_mov_b32_e32 v46, v14 +; GFX9-NEXT: v_mov_b32_e32 v45, v13 +; GFX9-NEXT: v_mov_b32_e32 v44, v12 +; GFX9-NEXT: v_mov_b32_e32 v43, v11 +; GFX9-NEXT: v_mov_b32_e32 v42, v10 +; GFX9-NEXT: v_mov_b32_e32 v41, v9 +; GFX9-NEXT: v_mov_b32_e32 v40, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v7 +; GFX9-NEXT: v_mov_b32_e32 v38, v6 +; GFX9-NEXT: v_mov_b32_e32 v37, v5 +; GFX9-NEXT: v_mov_b32_e32 v36, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 
offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB138_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, 
off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded 
Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v64f16_to_v64bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: 
s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14 +; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0 +; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12 +; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10 +; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8 +; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6 +; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4 +; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2 +; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56 +; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54 +; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58 +; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60 +; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62 +; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64 +; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB138_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33 +; GFX11-NEXT: v_dual_mov_b32 v64, 
v16 :: v_dual_mov_b32 v63, v15 +; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13 +; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11 +; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9 +; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7 +; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5 +; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3 +; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31 +; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29 +; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27 +; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25 +; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23 +; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21 +; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19 +; GFX11-NEXT: .LBB138_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off +; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112 +; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96 +; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80 +; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 +; GFX11-NEXT: 
scratch_load_b32 v47, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -2017,8 +27423,3201 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v128i8_to_v64bf16: + define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128 x i8> %value) { +; GCN-LABEL: v_bitcast_v128i8_to_v64bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:412 ; 
4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:676 
; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 +; GCN-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte 
Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:296 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:224 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 +; GCN-NEXT: s_waitcnt vmcnt(0) 
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:192 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:176 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v56, 0 +; GCN-NEXT: v_mov_b32_e32 v62, 0 
+; GCN-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v47, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v61, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v60, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:508 ; 4-byte 
Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v46, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: buffer_store_dword 
v51, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB139_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v8 +; GCN-NEXT: v_or_b32_e32 v7, v0, v3 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v8, v0, v3 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v24, v0, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v39 +; GCN-NEXT: v_or_b32_e32 v0, v0, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v38 +; GCN-NEXT: v_or_b32_e32 v23, v3, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v35 +; GCN-NEXT: v_or_b32_e32 v17, v3, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v15 +; GCN-NEXT: v_or_b32_e32 v18, v3, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v63 +; GCN-NEXT: v_or_b32_e32 v21, v3, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v59 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v58 +; GCN-NEXT: v_or_b32_e32 v25, v3, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v57 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v42 +; 
GCN-NEXT: v_or_b32_e32 v30, v3, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v45 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-NEXT: v_or_b32_e32 v35, v3, v4 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-NEXT: v_or_b32_e32 v15, v3, v4 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-NEXT: v_or_b32_e32 v42, v3, v4 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-NEXT: v_or_b32_e32 v45, v3, v4 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-NEXT: v_or_b32_e32 v32, v3, v4 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 
v4, 8, v4 +; GCN-NEXT: v_or_b32_e32 v57, v3, v4 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v22, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v12, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v14, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 
+; GCN-NEXT: v_or_b32_e32 v56, v5, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v33, v5, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v10, v5, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v27, v5, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v5, v5, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; 
GCN-NEXT: v_or_b32_e32 v52, v6, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v9, v6, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v40, v6, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v36, v6, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v26, v6, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: 
v_or_b32_e32 v3, v6, v3 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v31, v11, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v28, v11, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v6, v11, v6 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_or_b32_e32 v13, v13, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 
16, v11 +; GCN-NEXT: v_or_b32_e32 v61, v16, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_or_b32_e32 v11, v16, v11 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v41, v19, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v60, v19, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v44, v19, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, 
v19 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v19, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v19, v19, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v54, v20, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v20, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v20, v20, v16 +; GCN-NEXT: buffer_load_dword v16, 
off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v55, v29, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, 
s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, 
s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: 
v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:580 
; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v29, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v51, v29, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v50, v29, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; 
GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: .LBB139_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v22 +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_alignbit_b32 v48, v7, v8, 16 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v16 +; GCN-NEXT: v_alignbit_b32 v49, v7, v8, 16 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v56 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mov_b32_e32 v12, v50 +; GCN-NEXT: v_alignbit_b32 v50, v7, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v62 +; GCN-NEXT: v_mov_b32_e32 v14, v51 +; GCN-NEXT: v_alignbit_b32 v51, v4, v7, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v10 +; GCN-NEXT: v_alignbit_b32 v7, v4, v7, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47 +; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v52 +; GCN-NEXT: v_alignbit_b32 v9, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v43 +; GCN-NEXT: v_alignbit_b32 v10, v4, v5, 16 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v36 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v7, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v23 +; GCN-NEXT: v_alignbit_b32 v8, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v31 +; GCN-NEXT: v_alignbit_b32 v9, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24 +; GCN-NEXT: v_alignbit_b32 v10, v3, v4, 16 +; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v61 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v13 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v53 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v60 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_alignbit_b32 v6, v6, v0, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v17 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v18 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v30 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v46 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: 
s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, 
s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v128i8_to_v64bf16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:396 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v46, s19 +; VI-NEXT: v_mov_b32_e32 v45, s18 +; VI-NEXT: v_mov_b32_e32 v44, s17 +; VI-NEXT: v_mov_b32_e32 v43, s16 +; VI-NEXT: v_mov_b32_e32 v42, s15 +; VI-NEXT: v_mov_b32_e32 v41, s14 +; VI-NEXT: v_mov_b32_e32 v40, s13 +; VI-NEXT: v_mov_b32_e32 v39, s12 +; VI-NEXT: v_mov_b32_e32 v38, s11 +; VI-NEXT: v_mov_b32_e32 v37, s10 +; VI-NEXT: v_mov_b32_e32 v36, s9 +; VI-NEXT: v_mov_b32_e32 v35, s8 +; VI-NEXT: v_mov_b32_e32 v34, s7 +; VI-NEXT: v_mov_b32_e32 v33, s6 +; VI-NEXT: v_mov_b32_e32 v32, s5 +; VI-NEXT: v_mov_b32_e32 v31, s4 +; VI-NEXT: v_mov_b32_e32 v62, v46 +; VI-NEXT: v_mov_b32_e32 v61, v45 +; VI-NEXT: v_mov_b32_e32 v60, v44 +; VI-NEXT: v_mov_b32_e32 v59, v43 +; VI-NEXT: v_mov_b32_e32 v58, v42 +; VI-NEXT: v_mov_b32_e32 v57, v41 +; VI-NEXT: v_mov_b32_e32 
v56, v40 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_mov_b32_e32 v54, v38 +; VI-NEXT: v_mov_b32_e32 v53, v37 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: v_mov_b32_e32 v51, v35 +; VI-NEXT: v_mov_b32_e32 v50, v34 +; VI-NEXT: v_mov_b32_e32 v49, v33 +; VI-NEXT: v_mov_b32_e32 v48, v32 +; VI-NEXT: v_mov_b32_e32 v47, v31 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:392 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:388 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:376 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:372 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:368 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:364 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:360 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:348 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:344 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:340 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:336 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:332 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:280 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:240 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:232 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:216 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:212 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:208 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:204 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:200 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:188 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, 
s[0:3], s32 offset:180 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:168 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:160 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:152 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:144 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:136 +; 
VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:128 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:120 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v23, 
off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB139_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v28 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v12, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: 
v_lshlrev_b16_e32 v31, 8, v31 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32 +; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v33 +; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; VI-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v33, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v33 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; VI-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v33, v34 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v35 +; VI-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v36 +; VI-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v35, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v36 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:484 ; 4-byte 
Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v37 +; VI-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v38 +; VI-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v37, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v24 +; VI-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v23 +; VI-NEXT: v_or_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v40, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b16_e32 v11, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v26 +; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v41, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v17 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v25 +; VI-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v42, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v20 +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v19 +; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v43, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v63 +; VI-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v44, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v45, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v46, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:812 ; 
4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v47, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v50, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v51, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v52, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v53, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v54, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; 
VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v55, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v56, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v57, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v58, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v59, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v60, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v61, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 
4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v62, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: .LBB139_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x70 +; VI-NEXT: flat_store_dwordx4 v[3:4], v[35:38] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[31:34] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x60 +; VI-NEXT: flat_store_dwordx4 v[3:4], v[59:62] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x50 +; VI-NEXT: flat_store_dwordx4 v[3:4], v[55:58] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[51:54] +; VI-NEXT: flat_store_dwordx4 v[0:1], v[47:50] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:420 
; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v128i8_to_v64bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:396 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:560 
; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v46, s19 +; GFX9-NEXT: v_mov_b32_e32 v45, s18 +; GFX9-NEXT: v_mov_b32_e32 v44, s17 +; GFX9-NEXT: v_mov_b32_e32 v43, s16 +; GFX9-NEXT: v_mov_b32_e32 v42, s15 +; GFX9-NEXT: v_mov_b32_e32 v41, s14 +; GFX9-NEXT: v_mov_b32_e32 v40, s13 +; GFX9-NEXT: v_mov_b32_e32 v39, s12 +; GFX9-NEXT: v_mov_b32_e32 v38, s11 +; GFX9-NEXT: v_mov_b32_e32 v37, s10 +; GFX9-NEXT: v_mov_b32_e32 v36, s9 +; GFX9-NEXT: 
v_mov_b32_e32 v35, s8 +; GFX9-NEXT: v_mov_b32_e32 v34, s7 +; GFX9-NEXT: v_mov_b32_e32 v33, s6 +; GFX9-NEXT: v_mov_b32_e32 v32, s5 +; GFX9-NEXT: v_mov_b32_e32 v31, s4 +; GFX9-NEXT: v_mov_b32_e32 v62, v46 +; GFX9-NEXT: v_mov_b32_e32 v61, v45 +; GFX9-NEXT: v_mov_b32_e32 v60, v44 +; GFX9-NEXT: v_mov_b32_e32 v59, v43 +; GFX9-NEXT: v_mov_b32_e32 v58, v42 +; GFX9-NEXT: v_mov_b32_e32 v57, v41 +; GFX9-NEXT: v_mov_b32_e32 v56, v40 +; GFX9-NEXT: v_mov_b32_e32 v55, v39 +; GFX9-NEXT: v_mov_b32_e32 v54, v38 +; GFX9-NEXT: v_mov_b32_e32 v53, v37 +; GFX9-NEXT: v_mov_b32_e32 v52, v36 +; GFX9-NEXT: v_mov_b32_e32 v51, v35 +; GFX9-NEXT: v_mov_b32_e32 v50, v34 +; GFX9-NEXT: v_mov_b32_e32 v49, v33 +; GFX9-NEXT: v_mov_b32_e32 v48, v32 +; GFX9-NEXT: v_mov_b32_e32 v47, v31 +; GFX9-NEXT: s_waitcnt vmcnt(44) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:392 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:388 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:376 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:372 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:368 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:364 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:360 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:348 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:344 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:340 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:336 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:280 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:240 +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:232 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:216 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:212 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:208 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:204 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:200 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort 
v0, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:188 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:168 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:160 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded 
Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:152 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:144 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:136 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:120 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:100 
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB139_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 
v28, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v32, v34, v33, s6 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v33, v34, v33, s6 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v34, v35, v34, s6 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword 
v36, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v35, v36, v35, s6 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v36, v37, v36, s6 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v37, v38, v37, s6 +; GFX9-NEXT: v_perm_b32 v38, v11, v12, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v30 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v39, v12, v11, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v40, v12, v11, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v41, v4, v11, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v42, v4, v3, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v20 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v43, v4, v3, s6 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v44, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; 
GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v45, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v46, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; 
GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v47, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v48, v5, v4, s6 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v49, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v50, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v51, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v52, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v53, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v54, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v55, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v56, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v57, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload 
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v58, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v59, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v60, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v61, v3, v0, s6 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v62, v3, v0, s6 +; GFX9-NEXT: .LBB139_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:32 +; GFX9-NEXT: global_store_dwordx4 
v[1:2], v[35:38], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[59:62], off offset:112 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[55:58], off offset:96 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[51:54], off offset:80 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:64 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v128i8_to_v64bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:600 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:596 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:592 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:588 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:584 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:580 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:576 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:572 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:568 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:564 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:560 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:556 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:552 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:548 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:544 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:540 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:536 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:532 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:528 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:524 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:520 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, 
v77, s32 offset:516 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:512 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:508 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:504 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:500 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:496 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:492 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:488 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:484 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:480 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:476 +; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:472 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:468 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:464 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:460 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:456 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:452 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:448 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:444 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:440 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:436 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:432 +; GFX11-NEXT: ; meta instruction +; 
GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:428 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:424 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:420 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:416 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:412 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:408 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:404 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:400 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:396 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:392 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:388 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:384 +; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:380 +; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:376 +; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:372 +; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:368 +; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:364 +; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:360 +; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:356 +; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:352 +; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:348 +; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:344 +; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:340 +; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:336 +; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:332 +; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:328 +; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:324 +; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:320 +; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:316 +; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:312 +; 
GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:308 +; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:304 +; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:300 +; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:296 +; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:292 +; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:288 +; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:284 +; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:280 +; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:276 +; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:272 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:268 +; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:264 +; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:260 +; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:256 +; GFX11-NEXT: scratch_load_u16 v132, off, s32 offset:252 +; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:248 +; GFX11-NEXT: scratch_load_u16 v134, off, s32 offset:244 +; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:240 +; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:236 +; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:232 +; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:228 +; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:224 +; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:220 +; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:216 +; GFX11-NEXT: scratch_load_u16 v150, off, s32 offset:212 +; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:208 +; GFX11-NEXT: scratch_load_u16 v160, off, s32 offset:204 +; GFX11-NEXT: scratch_load_u16 v161, off, s32 offset:200 +; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:196 +; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:192 +; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:188 +; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:184 +; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:180 +; GFX11-NEXT: scratch_load_u16 v167, off, s32 offset:176 
+; GFX11-NEXT: scratch_load_u16 v176, off, s32 offset:172 +; GFX11-NEXT: scratch_load_u16 v177, off, s32 offset:168 +; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:164 +; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:160 +; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:156 +; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:152 +; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:148 +; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:144 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v63, off, s32 offset:140 +; GFX11-NEXT: scratch_load_u16 v72, off, s32 offset:136 +; GFX11-NEXT: scratch_load_u16 v73, off, s32 offset:132 +; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:128 +; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:124 +; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:120 +; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:116 +; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:112 +; GFX11-NEXT: scratch_load_u16 v79, off, s32 offset:108 +; GFX11-NEXT: scratch_load_u16 v88, off, s32 offset:104 +; GFX11-NEXT: scratch_load_u16 v89, off, s32 offset:100 +; GFX11-NEXT: scratch_load_u16 v90, off, s32 offset:96 +; GFX11-NEXT: scratch_load_u16 v91, off, s32 offset:92 +; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:88 +; GFX11-NEXT: scratch_load_u16 v93, off, s32 offset:84 +; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:80 +; GFX11-NEXT: scratch_load_u16 v95, off, s32 offset:76 +; GFX11-NEXT: scratch_load_u16 v104, off, s32 offset:72 +; GFX11-NEXT: scratch_load_u16 v105, off, s32 offset:68 +; GFX11-NEXT: scratch_load_u16 v106, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u16 v107, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u16 v108, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u16 v109, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u16 v110, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u16 v111, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u16 v120, off, s32 offset:40 +; GFX11-NEXT: 
scratch_load_u16 v121, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v122, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v123, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v124, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v125, off, s32 offset:20 +; GFX11-NEXT: scratch_load_u16 v126, off, s32 offset:16 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_u16 v127, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v136, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v137, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v138, off, s32 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v46, s15 :: v_dual_mov_b32 v45, s14 +; GFX11-NEXT: v_dual_mov_b32 v44, s13 :: v_dual_mov_b32 v43, s12 +; GFX11-NEXT: v_dual_mov_b32 v42, s11 :: v_dual_mov_b32 v41, s10 +; GFX11-NEXT: v_dual_mov_b32 v40, s9 :: v_dual_mov_b32 v39, s8 +; GFX11-NEXT: v_dual_mov_b32 v38, s7 :: v_dual_mov_b32 v37, s6 +; GFX11-NEXT: v_dual_mov_b32 v36, s5 :: v_dual_mov_b32 v35, s4 +; GFX11-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v33, s2 +; GFX11-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v31, s0 +; GFX11-NEXT: v_dual_mov_b32 v62, v46 :: v_dual_mov_b32 v61, v45 +; GFX11-NEXT: v_dual_mov_b32 v60, v44 :: v_dual_mov_b32 v59, v43 +; GFX11-NEXT: v_dual_mov_b32 v58, v42 :: v_dual_mov_b32 v57, v41 +; GFX11-NEXT: v_dual_mov_b32 v56, v40 :: v_dual_mov_b32 v55, v39 +; GFX11-NEXT: v_dual_mov_b32 
v54, v38 :: v_dual_mov_b32 v53, v37 +; GFX11-NEXT: v_dual_mov_b32 v52, v36 :: v_dual_mov_b32 v51, v35 +; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33 +; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB139_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v16 +; GFX11-NEXT: v_perm_b32 v31, v3, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v12 +; GFX11-NEXT: v_perm_b32 v32, v5, v4, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v14 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v18 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v19 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v20 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v21 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v22 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v24 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v26 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v28 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v30 +; 
GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_perm_b32 v33, v3, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v34, v5, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v35, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v36, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v37, v11, v10, 0x5040100 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v138 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v137 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v136 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v127 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v126 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v125 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v124 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v123 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v122 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v121 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v120 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v111 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v110 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v109 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v108 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v107 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v106 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v105 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v104 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v95 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_perm_b32 v38, v3, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v39, v5, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v40, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v41, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v42, v11, v10, 0x5040100 +; 
GFX11-NEXT: v_and_b32_e32 v0, 0xff, v94 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v93 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v92 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v91 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v90 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v89 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v88 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v79 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v78 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v77 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v76 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v75 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v74 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v73 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v72 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v63 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v183 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v182 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v181 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v180 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_perm_b32 v43, v3, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v44, v5, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v45, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v46, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v47, v11, v10, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v178 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v177 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v176 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v167 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v166 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v165 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v164 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v163 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v162 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; 
GFX11-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v161 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v160 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v151 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v150 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v149 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v148 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v147 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v146 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v145 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v144 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_perm_b32 v48, v3, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v49, v5, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v50, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v51, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v52, v11, v10, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v135 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v134 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v133 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v132 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v131 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v130 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v129 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v128 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v119 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v118 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v117 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v116 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v115 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v114 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v113 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v112 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v102 +; GFX11-NEXT: v_and_b32_e32 
v15, 0xff, v101 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v100 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_perm_b32 v53, v3, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v54, v5, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v55, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v56, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v57, v11, v10, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v98 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v97 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v87 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v86 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v85 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v84 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v83 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v82 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v81 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v71 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v70 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v69 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v68 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v67 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v65 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v64 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_perm_b32 v58, v3, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v59, v5, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v60, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v61, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v62, v11, v10, 0x5040100 +; 
GFX11-NEXT: .LBB139_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off +; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:112 +; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:96 +; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:80 +; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:64 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:400 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:404 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:408 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:412 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:416 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:420 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:424 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:428 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:432 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:436 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:440 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:444 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:448 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:452 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:456 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:460 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:464 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:468 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:472 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:476 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:480 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:484 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:488 +; GFX11-NEXT: 
scratch_load_b32 v91, off, s32 offset:492 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:496 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:500 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:504 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:508 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:512 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:516 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:520 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:524 +; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:528 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:532 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:536 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:540 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:544 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:548 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:552 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:556 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:560 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:564 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:568 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:572 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:576 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:580 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:584 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:588 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:592 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:596 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:600 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -2033,8 +30632,914 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v64i16: + define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v64bf16_to_v64i16: +; GCN: ; 
%bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v63, s36, 0 +; GCN-NEXT: v_writelane_b32 v63, s37, 1 +; GCN-NEXT: v_writelane_b32 v63, s38, 2 +; GCN-NEXT: v_writelane_b32 v63, s39, 3 +; GCN-NEXT: v_writelane_b32 v63, s48, 4 +; GCN-NEXT: v_writelane_b32 v63, s49, 5 +; GCN-NEXT: v_writelane_b32 v63, s50, 6 +; GCN-NEXT: v_writelane_b32 v63, s51, 7 +; GCN-NEXT: v_writelane_b32 v63, s52, 8 +; GCN-NEXT: v_writelane_b32 v63, s53, 9 +; GCN-NEXT: v_writelane_b32 v63, s54, 10 +; 
GCN-NEXT: v_writelane_b32 v63, s55, 11 +; GCN-NEXT: v_writelane_b32 v63, s64, 12 +; GCN-NEXT: v_writelane_b32 v63, s65, 13 +; GCN-NEXT: v_writelane_b32 v63, s66, 14 +; GCN-NEXT: v_writelane_b32 v63, s67, 15 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword 
v21, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 +; GCN-NEXT: s_mov_b32 s36, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s37, s36 +; GCN-NEXT: s_mov_b32 s38, s36 +; GCN-NEXT: s_mov_b32 s39, s36 +; GCN-NEXT: s_mov_b32 s40, s36 +; GCN-NEXT: s_mov_b32 s41, s36 +; GCN-NEXT: s_mov_b32 s42, s36 +; GCN-NEXT: s_mov_b32 s43, s36 +; GCN-NEXT: s_mov_b32 s44, s36 +; GCN-NEXT: s_mov_b32 s45, s36 +; GCN-NEXT: s_mov_b32 s46, s36 +; GCN-NEXT: s_mov_b32 s47, s36 +; GCN-NEXT: s_mov_b32 s48, s36 +; GCN-NEXT: s_mov_b32 s49, s36 +; GCN-NEXT: s_mov_b32 s50, s36 +; GCN-NEXT: s_mov_b32 s51, s36 +; GCN-NEXT: s_mov_b32 s52, s36 +; GCN-NEXT: s_mov_b32 s53, s36 +; GCN-NEXT: s_mov_b32 s54, s36 
+; GCN-NEXT: s_mov_b32 s55, s36 +; GCN-NEXT: s_mov_b32 s56, s36 +; GCN-NEXT: s_mov_b32 s57, s36 +; GCN-NEXT: s_mov_b32 s58, s36 +; GCN-NEXT: s_mov_b32 s59, s36 +; GCN-NEXT: s_mov_b32 s60, s36 +; GCN-NEXT: s_mov_b32 s61, s36 +; GCN-NEXT: s_mov_b32 s62, s36 +; GCN-NEXT: s_mov_b32 s63, s36 +; GCN-NEXT: s_mov_b32 s64, s36 +; GCN-NEXT: s_mov_b32 s65, s36 +; GCN-NEXT: s_mov_b32 s66, s36 +; GCN-NEXT: s_mov_b32 s67, s36 +; GCN-NEXT: v_mov_b32_e32 v31, s36 +; GCN-NEXT: v_mov_b32_e32 v32, s37 +; GCN-NEXT: v_mov_b32_e32 v33, s38 +; GCN-NEXT: v_mov_b32_e32 v34, s39 +; GCN-NEXT: v_mov_b32_e32 v35, s40 +; GCN-NEXT: v_mov_b32_e32 v36, s41 +; GCN-NEXT: v_mov_b32_e32 v37, s42 +; GCN-NEXT: v_mov_b32_e32 v38, s43 +; GCN-NEXT: v_mov_b32_e32 v39, s44 +; GCN-NEXT: v_mov_b32_e32 v40, s45 +; GCN-NEXT: v_mov_b32_e32 v41, s46 +; GCN-NEXT: v_mov_b32_e32 v42, s47 +; GCN-NEXT: v_mov_b32_e32 v43, s48 +; GCN-NEXT: v_mov_b32_e32 v44, s49 +; GCN-NEXT: v_mov_b32_e32 v45, s50 +; GCN-NEXT: v_mov_b32_e32 v46, s51 +; GCN-NEXT: v_mov_b32_e32 v47, s52 +; GCN-NEXT: v_mov_b32_e32 v48, s53 +; GCN-NEXT: v_mov_b32_e32 v49, s54 +; GCN-NEXT: v_mov_b32_e32 v50, s55 +; GCN-NEXT: v_mov_b32_e32 v51, s56 +; GCN-NEXT: v_mov_b32_e32 v52, s57 +; GCN-NEXT: v_mov_b32_e32 v53, s58 +; GCN-NEXT: v_mov_b32_e32 v54, s59 +; GCN-NEXT: v_mov_b32_e32 v55, s60 +; GCN-NEXT: v_mov_b32_e32 v56, s61 +; GCN-NEXT: v_mov_b32_e32 v57, s62 +; GCN-NEXT: v_mov_b32_e32 v58, s63 +; GCN-NEXT: v_mov_b32_e32 v59, s64 +; GCN-NEXT: v_mov_b32_e32 v60, s65 +; GCN-NEXT: v_mov_b32_e32 v61, s66 +; GCN-NEXT: v_mov_b32_e32 v62, s67 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB140_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v32, v4, 
v5, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v36, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v37, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v38, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v39, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v40, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v41, v0, v3, 16 +; GCN-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v42, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v43, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v44, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v45, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v46, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v47, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v48, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v27 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded 
Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v5 +; GCN-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v8 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v9 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v10 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_alignbit_b32 v49, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v50, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v51, v16, v17, 16 +; GCN-NEXT: v_alignbit_b32 v52, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v53, v20, v0, 16 +; GCN-NEXT: v_alignbit_b32 v54, v21, v3, 16 +; GCN-NEXT: v_alignbit_b32 v55, v22, v4, 16 +; GCN-NEXT: v_alignbit_b32 v56, v23, v5, 16 +; GCN-NEXT: v_alignbit_b32 v57, v24, v6, 16 +; GCN-NEXT: v_alignbit_b32 v58, v25, v7, 16 +; GCN-NEXT: v_alignbit_b32 v59, v26, v8, 16 +; GCN-NEXT: v_alignbit_b32 v60, v27, v9, 16 +; GCN-NEXT: v_alignbit_b32 v61, v28, v10, 16 +; GCN-NEXT: v_alignbit_b32 v62, v29, v11, 16 +; GCN-NEXT: .LBB140_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s37, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s36 +; GCN-NEXT: s_mov_b32 s5, s36 +; GCN-NEXT: s_mov_b64 s[6:7], s[36:37] +; GCN-NEXT: buffer_store_dwordx4 v[59:62], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_store_dwordx4 v[55:58], v[1:2], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_store_dwordx4 v[51:54], v[1:2], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[47:50], v[1:2], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: 
buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_readlane_b32 s67, v63, 15 +; GCN-NEXT: v_readlane_b32 s66, v63, 14 +; GCN-NEXT: v_readlane_b32 s65, v63, 13 +; GCN-NEXT: v_readlane_b32 s64, v63, 12 +; GCN-NEXT: v_readlane_b32 s55, v63, 11 +; GCN-NEXT: v_readlane_b32 s54, v63, 10 +; GCN-NEXT: v_readlane_b32 s53, v63, 9 +; GCN-NEXT: v_readlane_b32 s52, v63, 8 +; GCN-NEXT: v_readlane_b32 s51, v63, 7 +; GCN-NEXT: v_readlane_b32 s50, v63, 6 +; GCN-NEXT: v_readlane_b32 s49, v63, 5 +; GCN-NEXT: v_readlane_b32 s48, v63, 4 +; GCN-NEXT: v_readlane_b32 s39, v63, 3 +; GCN-NEXT: v_readlane_b32 s38, v63, 2 +; GCN-NEXT: v_readlane_b32 s37, v63, 1 +; GCN-NEXT: v_readlane_b32 s36, v63, 0 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v42, off, 
s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v64bf16_to_v64i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 
v50, s19 +; VI-NEXT: v_mov_b32_e32 v49, s18 +; VI-NEXT: v_mov_b32_e32 v48, s17 +; VI-NEXT: v_mov_b32_e32 v47, s16 +; VI-NEXT: v_mov_b32_e32 v46, s15 +; VI-NEXT: v_mov_b32_e32 v45, s14 +; VI-NEXT: v_mov_b32_e32 v44, s13 +; VI-NEXT: v_mov_b32_e32 v43, s12 +; VI-NEXT: v_mov_b32_e32 v42, s11 +; VI-NEXT: v_mov_b32_e32 v41, s10 +; VI-NEXT: v_mov_b32_e32 v40, s9 +; VI-NEXT: v_mov_b32_e32 v39, s8 +; VI-NEXT: v_mov_b32_e32 v38, s7 +; VI-NEXT: v_mov_b32_e32 v37, s6 +; VI-NEXT: v_mov_b32_e32 v36, s5 +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: 
s_cbranch_execz .LBB140_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v50, v18 +; VI-NEXT: v_mov_b32_e32 v49, v17 +; VI-NEXT: v_mov_b32_e32 v48, v16 +; VI-NEXT: v_mov_b32_e32 v47, v15 +; VI-NEXT: v_mov_b32_e32 v46, v14 +; VI-NEXT: v_mov_b32_e32 v45, v13 +; VI-NEXT: v_mov_b32_e32 v44, v12 +; VI-NEXT: v_mov_b32_e32 v43, v11 +; VI-NEXT: v_mov_b32_e32 v42, v10 +; VI-NEXT: v_mov_b32_e32 v41, v9 +; VI-NEXT: v_mov_b32_e32 v40, v8 +; VI-NEXT: v_mov_b32_e32 v39, v7 +; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v36, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded 
Spill +; VI-NEXT: .LBB140_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38] +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: s_movk_i32 s4, 0x70 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, 
vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x60 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x50 +; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] +; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v64bf16_to_v64i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, 
off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v50, s19 +; GFX9-NEXT: v_mov_b32_e32 v49, s18 +; GFX9-NEXT: v_mov_b32_e32 v48, s17 +; GFX9-NEXT: v_mov_b32_e32 v47, s16 +; GFX9-NEXT: v_mov_b32_e32 v46, s15 +; GFX9-NEXT: v_mov_b32_e32 v45, s14 +; GFX9-NEXT: v_mov_b32_e32 v44, s13 +; GFX9-NEXT: v_mov_b32_e32 v43, s12 +; GFX9-NEXT: v_mov_b32_e32 v42, s11 +; GFX9-NEXT: v_mov_b32_e32 v41, s10 +; GFX9-NEXT: v_mov_b32_e32 v40, s9 +; GFX9-NEXT: v_mov_b32_e32 v39, s8 +; GFX9-NEXT: v_mov_b32_e32 v38, s7 +; GFX9-NEXT: v_mov_b32_e32 v37, s6 +; GFX9-NEXT: v_mov_b32_e32 v36, s5 +; GFX9-NEXT: v_mov_b32_e32 v35, s4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB140_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v50, v18 +; GFX9-NEXT: v_mov_b32_e32 v49, v17 +; GFX9-NEXT: v_mov_b32_e32 v48, v16 +; GFX9-NEXT: v_mov_b32_e32 v47, v15 +; GFX9-NEXT: v_mov_b32_e32 v46, v14 +; GFX9-NEXT: v_mov_b32_e32 v45, v13 +; GFX9-NEXT: v_mov_b32_e32 v44, v12 +; GFX9-NEXT: v_mov_b32_e32 v43, v11 +; GFX9-NEXT: v_mov_b32_e32 v42, v10 +; GFX9-NEXT: v_mov_b32_e32 v41, v9 +; GFX9-NEXT: v_mov_b32_e32 v40, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v7 +; GFX9-NEXT: v_mov_b32_e32 v38, v6 +; GFX9-NEXT: v_mov_b32_e32 v37, v5 +; GFX9-NEXT: v_mov_b32_e32 v36, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB140_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], 
s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v64bf16_to_v64i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: 
s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14 +; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0 +; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12 +; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10 +; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8 +; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6 +; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4 +; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2 +; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56 +; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54 +; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58 +; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60 +; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62 +; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64 +; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB140_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33 +; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15 +; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13 +; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11 +; GFX11-NEXT: v_dual_mov_b32 
v58, v10 :: v_dual_mov_b32 v57, v9 +; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7 +; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5 +; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3 +; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31 +; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29 +; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27 +; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25 +; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23 +; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21 +; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19 +; GFX11-NEXT: .LBB140_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off +; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112 +; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96 +; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80 +; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 
offset:60 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -2049,8 +31554,1429 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v64f16: + define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v64bf16_to_v64f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; 
GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt 
expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v57, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: 
buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v58, 0 +; GCN-NEXT: v_mov_b32_e32 v56, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: v_mov_b32_e32 v45, 0 +; GCN-NEXT: v_mov_b32_e32 v47, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 +; GCN-NEXT: v_mov_b32_e32 v46, 0 +; GCN-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NEXT: v_mov_b32_e32 v37, 0 +; GCN-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v38, 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 
0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 
v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB141_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) 
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v61 +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v23 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v33 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v28 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v32 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v24 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v31 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v29 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v63 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v60 +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v62 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v59 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte 
Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0 +; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte 
Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v61 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v59 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v60 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v61 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_lshrrev_b32_e32 v62, 16, v62 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v63 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v52 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v53 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v54 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v55 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v49, 
16, v40 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v41 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v43 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v44 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v45 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v46 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v47 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v56 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v57 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v58 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; 
GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v57, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v58, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v56, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v47, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v44, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v46, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v41, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v43, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v40, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v42, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v28 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v31 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v35 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v59 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v63 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill 
+; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, 
s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: .LBB141_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v58 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v57 +; GCN-NEXT: v_or_b32_e32 v3, v3, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v56 +; GCN-NEXT: v_or_b32_e32 v4, v4, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v45 +; GCN-NEXT: v_or_b32_e32 v5, v5, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v44 +; GCN-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v41 +; GCN-NEXT: v_or_b32_e32 v3, v3, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v40 +; GCN-NEXT: v_or_b32_e32 
v4, v4, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v54 +; GCN-NEXT: v_or_b32_e32 v5, v5, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v52 +; GCN-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v49 +; GCN-NEXT: v_or_b32_e32 v3, v3, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v48 +; GCN-NEXT: v_or_b32_e32 v4, v4, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v37 +; GCN-NEXT: v_or_b32_e32 v5, v5, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v36 +; GCN-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_or_b32_e32 v4, v4, v0 +; GCN-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_or_b32_e32 v5, v5, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_or_b32_e32 v4, v4, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; 
GCN-NEXT: v_or_b32_e32 v5, v5, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_or_b32_e32 v4, v4, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_or_b32_e32 v5, v5, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_or_b32_e32 v4, v4, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_or_b32_e32 v5, v5, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_or_b32_e32 v4, v4, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v22 +; GCN-NEXT: v_or_b32_e32 v5, v5, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], 
s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v64bf16_to_v64f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: s_mov_b32 s19, s4 +; 
VI-NEXT: s_mov_b32 s5, s4 +; VI-NEXT: s_mov_b32 s6, s4 +; VI-NEXT: s_mov_b32 s7, s4 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s4 +; VI-NEXT: s_mov_b32 s10, s4 +; VI-NEXT: s_mov_b32 s11, s4 +; VI-NEXT: s_mov_b32 s12, s4 +; VI-NEXT: s_mov_b32 s13, s4 +; VI-NEXT: s_mov_b32 s14, s4 +; VI-NEXT: s_mov_b32 s15, s4 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s4 +; VI-NEXT: s_mov_b32 s18, s4 +; VI-NEXT: v_mov_b32_e32 v50, s19 +; VI-NEXT: v_mov_b32_e32 v49, s18 +; VI-NEXT: v_mov_b32_e32 v48, s17 +; VI-NEXT: v_mov_b32_e32 v47, s16 +; VI-NEXT: v_mov_b32_e32 v46, s15 +; VI-NEXT: v_mov_b32_e32 v45, s14 +; VI-NEXT: v_mov_b32_e32 v44, s13 +; VI-NEXT: v_mov_b32_e32 v43, s12 +; VI-NEXT: v_mov_b32_e32 v42, s11 +; VI-NEXT: v_mov_b32_e32 v41, s10 +; VI-NEXT: v_mov_b32_e32 v40, s9 +; VI-NEXT: v_mov_b32_e32 v39, s8 +; VI-NEXT: v_mov_b32_e32 v38, s7 +; VI-NEXT: v_mov_b32_e32 v37, s6 +; VI-NEXT: v_mov_b32_e32 v36, s5 +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 
offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB141_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: v_mov_b32_e32 v50, v18 +; VI-NEXT: v_mov_b32_e32 v49, v17 +; VI-NEXT: v_mov_b32_e32 v48, v16 +; VI-NEXT: v_mov_b32_e32 v47, v15 +; VI-NEXT: v_mov_b32_e32 v46, v14 +; VI-NEXT: v_mov_b32_e32 v45, v13 +; VI-NEXT: v_mov_b32_e32 v44, v12 +; VI-NEXT: v_mov_b32_e32 v43, v11 +; VI-NEXT: v_mov_b32_e32 v42, v10 +; VI-NEXT: v_mov_b32_e32 v41, v9 +; VI-NEXT: v_mov_b32_e32 v40, v8 +; VI-NEXT: v_mov_b32_e32 v39, v7 +; VI-NEXT: v_mov_b32_e32 v38, v6 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v36, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: .LBB141_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38] +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: s_movk_i32 s4, 0x70 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x60 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x50 +; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] +; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v64bf16_to_v64f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s19, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_mov_b32 s6, s4 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_mov_b32 s9, s4 +; GFX9-NEXT: s_mov_b32 s10, s4 +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: s_mov_b32 s12, s4 +; GFX9-NEXT: s_mov_b32 s13, s4 +; GFX9-NEXT: s_mov_b32 s14, s4 +; GFX9-NEXT: s_mov_b32 s15, s4 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s4 +; GFX9-NEXT: s_mov_b32 s18, s4 +; GFX9-NEXT: v_mov_b32_e32 v50, s19 +; GFX9-NEXT: v_mov_b32_e32 v49, s18 +; GFX9-NEXT: v_mov_b32_e32 v48, s17 +; GFX9-NEXT: v_mov_b32_e32 v47, s16 +; GFX9-NEXT: v_mov_b32_e32 v46, s15 +; GFX9-NEXT: v_mov_b32_e32 v45, s14 +; GFX9-NEXT: v_mov_b32_e32 v44, s13 +; GFX9-NEXT: v_mov_b32_e32 v43, s12 +; GFX9-NEXT: v_mov_b32_e32 v42, s11 +; GFX9-NEXT: v_mov_b32_e32 v41, s10 +; GFX9-NEXT: v_mov_b32_e32 v40, s9 +; GFX9-NEXT: v_mov_b32_e32 v39, s8 +; GFX9-NEXT: v_mov_b32_e32 v38, s7 +; GFX9-NEXT: v_mov_b32_e32 v37, s6 +; GFX9-NEXT: v_mov_b32_e32 v36, s5 +; GFX9-NEXT: v_mov_b32_e32 v35, s4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: 
buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB141_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: v_mov_b32_e32 v50, v18 +; GFX9-NEXT: v_mov_b32_e32 v49, v17 +; GFX9-NEXT: v_mov_b32_e32 v48, v16 +; GFX9-NEXT: v_mov_b32_e32 v47, v15 +; GFX9-NEXT: v_mov_b32_e32 v46, v14 +; GFX9-NEXT: v_mov_b32_e32 v45, v13 +; GFX9-NEXT: v_mov_b32_e32 v44, v12 +; GFX9-NEXT: v_mov_b32_e32 v43, v11 +; GFX9-NEXT: v_mov_b32_e32 v42, v10 +; GFX9-NEXT: v_mov_b32_e32 v41, v9 +; GFX9-NEXT: v_mov_b32_e32 v40, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v7 +; GFX9-NEXT: v_mov_b32_e32 v38, v6 +; 
GFX9-NEXT: v_mov_b32_e32 v37, v5 +; GFX9-NEXT: v_mov_b32_e32 v36, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB141_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 
; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v64bf16_to_v64f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v34, 
off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14 +; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0 +; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12 +; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10 +; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8 +; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6 +; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4 +; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2 +; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56 +; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54 +; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58 +; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60 +; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62 +; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64 +; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 
+; GFX11-NEXT: s_cbranch_execz .LBB141_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33 +; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15 +; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13 +; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11 +; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9 +; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7 +; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5 +; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3 +; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31 +; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29 +; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27 +; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25 +; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23 +; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21 +; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19 +; GFX11-NEXT: .LBB141_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off +; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112 +; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96 +; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80 +; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 
v60, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -2065,8 +32991,1333 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v128i8: + define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v64bf16_to_v128i8: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte 
Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v63, s36, 0 +; GCN-NEXT: v_writelane_b32 v63, s37, 1 +; GCN-NEXT: v_writelane_b32 v63, s38, 2 +; GCN-NEXT: v_writelane_b32 v63, s39, 3 +; GCN-NEXT: v_writelane_b32 v63, s48, 4 +; GCN-NEXT: v_writelane_b32 v63, s49, 5 +; GCN-NEXT: v_writelane_b32 v63, s50, 6 +; GCN-NEXT: v_writelane_b32 v63, s51, 7 +; GCN-NEXT: v_writelane_b32 v63, s52, 8 +; GCN-NEXT: v_writelane_b32 v63, s53, 9 +; GCN-NEXT: v_writelane_b32 v63, s54, 10 +; GCN-NEXT: v_writelane_b32 v63, s55, 11 +; GCN-NEXT: v_writelane_b32 v63, s64, 12 +; GCN-NEXT: v_writelane_b32 v63, s65, 13 +; GCN-NEXT: v_writelane_b32 v63, s66, 14 +; GCN-NEXT: v_writelane_b32 v63, s67, 15 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v22, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, 
s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v29, off, 
s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 +; GCN-NEXT: s_mov_b32 s36, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s37, s36 +; GCN-NEXT: s_mov_b32 s38, s36 +; GCN-NEXT: s_mov_b32 s39, s36 +; GCN-NEXT: s_mov_b32 s40, s36 +; GCN-NEXT: s_mov_b32 s41, s36 +; GCN-NEXT: s_mov_b32 s42, s36 +; GCN-NEXT: s_mov_b32 s43, s36 +; GCN-NEXT: s_mov_b32 s44, s36 +; GCN-NEXT: s_mov_b32 s45, s36 +; GCN-NEXT: s_mov_b32 s46, s36 +; GCN-NEXT: s_mov_b32 s47, s36 +; GCN-NEXT: s_mov_b32 s48, s36 +; GCN-NEXT: s_mov_b32 s49, s36 +; GCN-NEXT: s_mov_b32 s50, s36 +; GCN-NEXT: s_mov_b32 s51, s36 +; GCN-NEXT: s_mov_b32 s52, s36 +; GCN-NEXT: s_mov_b32 s53, s36 +; GCN-NEXT: s_mov_b32 s54, s36 +; GCN-NEXT: s_mov_b32 s55, s36 +; GCN-NEXT: s_mov_b32 s56, s36 +; GCN-NEXT: s_mov_b32 s57, s36 +; GCN-NEXT: s_mov_b32 s58, s36 +; GCN-NEXT: s_mov_b32 s59, s36 +; GCN-NEXT: s_mov_b32 s60, s36 +; GCN-NEXT: s_mov_b32 s61, s36 +; GCN-NEXT: s_mov_b32 s62, s36 +; GCN-NEXT: s_mov_b32 s63, s36 +; GCN-NEXT: s_mov_b32 s64, s36 +; GCN-NEXT: s_mov_b32 s65, s36 +; GCN-NEXT: s_mov_b32 s66, s36 +; GCN-NEXT: s_mov_b32 s67, s36 +; GCN-NEXT: v_mov_b32_e32 v31, s36 +; GCN-NEXT: v_mov_b32_e32 v32, s37 +; GCN-NEXT: v_mov_b32_e32 v33, s38 +; GCN-NEXT: v_mov_b32_e32 v34, s39 +; GCN-NEXT: v_mov_b32_e32 v35, s40 +; GCN-NEXT: v_mov_b32_e32 v36, s41 +; GCN-NEXT: v_mov_b32_e32 v37, s42 +; GCN-NEXT: v_mov_b32_e32 v38, s43 +; GCN-NEXT: v_mov_b32_e32 v39, s44 +; GCN-NEXT: v_mov_b32_e32 v40, s45 +; GCN-NEXT: v_mov_b32_e32 v41, s46 +; GCN-NEXT: v_mov_b32_e32 v42, s47 +; GCN-NEXT: v_mov_b32_e32 v43, s48 +; GCN-NEXT: v_mov_b32_e32 v44, s49 +; 
GCN-NEXT: v_mov_b32_e32 v45, s50 +; GCN-NEXT: v_mov_b32_e32 v46, s51 +; GCN-NEXT: v_mov_b32_e32 v47, s52 +; GCN-NEXT: v_mov_b32_e32 v48, s53 +; GCN-NEXT: v_mov_b32_e32 v49, s54 +; GCN-NEXT: v_mov_b32_e32 v50, s55 +; GCN-NEXT: v_mov_b32_e32 v51, s56 +; GCN-NEXT: v_mov_b32_e32 v52, s57 +; GCN-NEXT: v_mov_b32_e32 v53, s58 +; GCN-NEXT: v_mov_b32_e32 v54, s59 +; GCN-NEXT: v_mov_b32_e32 v55, s60 +; GCN-NEXT: v_mov_b32_e32 v56, s61 +; GCN-NEXT: v_mov_b32_e32 v57, s62 +; GCN-NEXT: v_mov_b32_e32 v58, s63 +; GCN-NEXT: v_mov_b32_e32 v59, s64 +; GCN-NEXT: v_mov_b32_e32 v60, s65 +; GCN-NEXT: v_mov_b32_e32 v61, s66 +; GCN-NEXT: v_mov_b32_e32 v62, s67 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB142_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v36, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v37, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v17 +; 
GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v38, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v39, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v40, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v41, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v42, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v43, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 
1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v44, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v45, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v46, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v47, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v48, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v27 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v8 +; 
GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v9 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v10 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_alignbit_b32 v49, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v50, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v51, v16, v17, 16 +; GCN-NEXT: v_alignbit_b32 v52, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v53, v20, v0, 16 +; GCN-NEXT: v_alignbit_b32 v54, v21, v3, 16 
+; GCN-NEXT: v_alignbit_b32 v55, v22, v4, 16 +; GCN-NEXT: v_alignbit_b32 v56, v23, v5, 16 +; GCN-NEXT: v_alignbit_b32 v57, v24, v6, 16 +; GCN-NEXT: v_alignbit_b32 v58, v25, v7, 16 +; GCN-NEXT: v_alignbit_b32 v59, v26, v8, 16 +; GCN-NEXT: v_alignbit_b32 v60, v27, v9, 16 +; GCN-NEXT: v_alignbit_b32 v61, v28, v10, 16 +; GCN-NEXT: v_alignbit_b32 v62, v29, v11, 16 +; GCN-NEXT: .LBB142_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s37, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s36 +; GCN-NEXT: s_mov_b32 s5, s36 +; GCN-NEXT: s_mov_b64 s[6:7], s[36:37] +; GCN-NEXT: buffer_store_dwordx4 v[59:62], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_store_dwordx4 v[55:58], v[1:2], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_store_dwordx4 v[51:54], v[1:2], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[47:50], v[1:2], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_readlane_b32 s67, v63, 15 +; GCN-NEXT: v_readlane_b32 s66, v63, 14 +; GCN-NEXT: v_readlane_b32 s65, v63, 13 +; GCN-NEXT: v_readlane_b32 s64, v63, 12 +; GCN-NEXT: v_readlane_b32 s55, v63, 11 +; GCN-NEXT: v_readlane_b32 s54, v63, 10 +; GCN-NEXT: v_readlane_b32 s53, v63, 9 +; GCN-NEXT: v_readlane_b32 s52, v63, 8 +; GCN-NEXT: v_readlane_b32 s51, v63, 7 +; GCN-NEXT: v_readlane_b32 s50, v63, 6 +; GCN-NEXT: v_readlane_b32 s49, v63, 5 +; GCN-NEXT: v_readlane_b32 s48, v63, 4 +; GCN-NEXT: v_readlane_b32 s39, v63, 3 +; GCN-NEXT: v_readlane_b32 s38, v63, 2 +; GCN-NEXT: v_readlane_b32 s37, v63, 1 +; GCN-NEXT: v_readlane_b32 s36, v63, 0 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v64bf16_to_v128i8: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: v_writelane_b32 v63, s36, 0 +; VI-NEXT: v_writelane_b32 v63, s37, 1 +; VI-NEXT: v_writelane_b32 v63, s38, 2 +; VI-NEXT: v_writelane_b32 v63, s39, 3 +; VI-NEXT: v_writelane_b32 v63, s48, 4 +; VI-NEXT: v_writelane_b32 v63, s49, 5 +; VI-NEXT: v_writelane_b32 v63, s50, 6 +; VI-NEXT: v_writelane_b32 v63, s51, 7 +; VI-NEXT: v_writelane_b32 v63, s52, 8 +; VI-NEXT: v_writelane_b32 v63, s53, 9 +; VI-NEXT: v_writelane_b32 v63, s54, 10 +; VI-NEXT: v_writelane_b32 v63, s55, 11 +; VI-NEXT: v_writelane_b32 v63, s64, 12 +; VI-NEXT: v_writelane_b32 v63, s65, 13 +; VI-NEXT: v_writelane_b32 v63, s66, 14 +; VI-NEXT: v_writelane_b32 v63, s67, 15 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 
s36, 0 +; VI-NEXT: s_mov_b32 s37, s36 +; VI-NEXT: s_mov_b32 s38, s36 +; VI-NEXT: s_mov_b32 s39, s36 +; VI-NEXT: s_mov_b32 s40, s36 +; VI-NEXT: s_mov_b32 s41, s36 +; VI-NEXT: s_mov_b32 s42, s36 +; VI-NEXT: s_mov_b32 s43, s36 +; VI-NEXT: s_mov_b32 s44, s36 +; VI-NEXT: s_mov_b32 s45, s36 +; VI-NEXT: s_mov_b32 s46, s36 +; VI-NEXT: s_mov_b32 s47, s36 +; VI-NEXT: s_mov_b32 s48, s36 +; VI-NEXT: s_mov_b32 s49, s36 +; VI-NEXT: s_mov_b32 s50, s36 +; VI-NEXT: s_mov_b32 s51, s36 +; VI-NEXT: s_mov_b32 s52, s36 +; VI-NEXT: s_mov_b32 s53, s36 +; VI-NEXT: s_mov_b32 s54, s36 +; VI-NEXT: s_mov_b32 s55, s36 +; VI-NEXT: s_mov_b32 s56, s36 +; VI-NEXT: s_mov_b32 s57, s36 +; VI-NEXT: s_mov_b32 s58, s36 +; VI-NEXT: s_mov_b32 s59, s36 +; VI-NEXT: s_mov_b32 s60, s36 +; VI-NEXT: s_mov_b32 s61, s36 +; VI-NEXT: s_mov_b32 s62, s36 +; VI-NEXT: s_mov_b32 s63, s36 +; VI-NEXT: s_mov_b32 s64, s36 +; VI-NEXT: s_mov_b32 s65, s36 +; VI-NEXT: s_mov_b32 s66, s36 +; VI-NEXT: s_mov_b32 s67, s36 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v31, s36 +; VI-NEXT: v_mov_b32_e32 v32, s37 +; VI-NEXT: v_mov_b32_e32 v33, s38 +; VI-NEXT: v_mov_b32_e32 v34, s39 +; VI-NEXT: v_mov_b32_e32 v35, s40 +; VI-NEXT: v_mov_b32_e32 v36, s41 +; 
VI-NEXT: v_mov_b32_e32 v37, s42 +; VI-NEXT: v_mov_b32_e32 v38, s43 +; VI-NEXT: v_mov_b32_e32 v39, s44 +; VI-NEXT: v_mov_b32_e32 v40, s45 +; VI-NEXT: v_mov_b32_e32 v41, s46 +; VI-NEXT: v_mov_b32_e32 v42, s47 +; VI-NEXT: v_mov_b32_e32 v43, s48 +; VI-NEXT: v_mov_b32_e32 v44, s49 +; VI-NEXT: v_mov_b32_e32 v45, s50 +; VI-NEXT: v_mov_b32_e32 v46, s51 +; VI-NEXT: v_mov_b32_e32 v47, s52 +; VI-NEXT: v_mov_b32_e32 v48, s53 +; VI-NEXT: v_mov_b32_e32 v49, s54 +; VI-NEXT: v_mov_b32_e32 v50, s55 +; VI-NEXT: v_mov_b32_e32 v51, s56 +; VI-NEXT: v_mov_b32_e32 v52, s57 +; VI-NEXT: v_mov_b32_e32 v53, s58 +; VI-NEXT: v_mov_b32_e32 v54, s59 +; VI-NEXT: v_mov_b32_e32 v55, s60 +; VI-NEXT: v_mov_b32_e32 v56, s61 +; VI-NEXT: v_mov_b32_e32 v57, s62 +; VI-NEXT: v_mov_b32_e32 v58, s63 +; VI-NEXT: v_mov_b32_e32 v59, s64 +; VI-NEXT: v_mov_b32_e32 v60, s65 +; VI-NEXT: v_mov_b32_e32 v61, s66 +; VI-NEXT: v_mov_b32_e32 v62, s67 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB142_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: .LBB142_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword 
v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_movk_i32 s4, 0x70 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x60 +; VI-NEXT: v_readlane_b32 s67, v63, 15 +; VI-NEXT: v_readlane_b32 s66, v63, 14 +; VI-NEXT: v_readlane_b32 s65, v63, 13 +; VI-NEXT: v_readlane_b32 s64, v63, 12 +; VI-NEXT: v_readlane_b32 s55, v63, 11 +; VI-NEXT: v_readlane_b32 s54, v63, 10 +; VI-NEXT: v_readlane_b32 s53, v63, 9 +; VI-NEXT: v_readlane_b32 s52, v63, 8 +; VI-NEXT: v_readlane_b32 s51, v63, 7 +; VI-NEXT: v_readlane_b32 s50, v63, 6 +; VI-NEXT: v_readlane_b32 s49, v63, 5 +; VI-NEXT: v_readlane_b32 s48, v63, 4 +; VI-NEXT: v_readlane_b32 s39, v63, 3 +; VI-NEXT: v_readlane_b32 s38, v63, 2 +; VI-NEXT: v_readlane_b32 s37, v63, 1 +; VI-NEXT: v_readlane_b32 s36, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dwordx4 v[3:4], v[33:36] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: s_movk_i32 s4, 0x50 +; VI-NEXT: flat_store_dwordx4 v[3:4], v[29:32] +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[25:28] +; VI-NEXT: v_add_u32_e32 v3, vcc, 64, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[21:24] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[5:8] +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v64bf16_to_v128i8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, 
off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v63, s36, 0 +; GFX9-NEXT: v_writelane_b32 v63, s37, 1 +; GFX9-NEXT: v_writelane_b32 v63, s38, 2 +; GFX9-NEXT: v_writelane_b32 v63, s39, 3 +; GFX9-NEXT: v_writelane_b32 v63, s48, 4 +; GFX9-NEXT: v_writelane_b32 v63, s49, 5 +; GFX9-NEXT: v_writelane_b32 v63, s50, 6 +; GFX9-NEXT: v_writelane_b32 v63, s51, 7 +; GFX9-NEXT: v_writelane_b32 v63, s52, 8 +; GFX9-NEXT: v_writelane_b32 v63, s53, 9 +; GFX9-NEXT: v_writelane_b32 v63, s54, 10 +; GFX9-NEXT: v_writelane_b32 v63, s55, 11 +; GFX9-NEXT: v_writelane_b32 v63, s64, 12 +; GFX9-NEXT: v_writelane_b32 v63, s65, 13 +; GFX9-NEXT: v_writelane_b32 v63, s66, 14 +; GFX9-NEXT: v_writelane_b32 v63, s67, 15 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s36, 0 +; GFX9-NEXT: s_mov_b32 s37, s36 +; 
GFX9-NEXT: s_mov_b32 s38, s36 +; GFX9-NEXT: s_mov_b32 s39, s36 +; GFX9-NEXT: s_mov_b32 s40, s36 +; GFX9-NEXT: s_mov_b32 s41, s36 +; GFX9-NEXT: s_mov_b32 s42, s36 +; GFX9-NEXT: s_mov_b32 s43, s36 +; GFX9-NEXT: s_mov_b32 s44, s36 +; GFX9-NEXT: s_mov_b32 s45, s36 +; GFX9-NEXT: s_mov_b32 s46, s36 +; GFX9-NEXT: s_mov_b32 s47, s36 +; GFX9-NEXT: s_mov_b32 s48, s36 +; GFX9-NEXT: s_mov_b32 s49, s36 +; GFX9-NEXT: s_mov_b32 s50, s36 +; GFX9-NEXT: s_mov_b32 s51, s36 +; GFX9-NEXT: s_mov_b32 s52, s36 +; GFX9-NEXT: s_mov_b32 s53, s36 +; GFX9-NEXT: s_mov_b32 s54, s36 +; GFX9-NEXT: s_mov_b32 s55, s36 +; GFX9-NEXT: s_mov_b32 s56, s36 +; GFX9-NEXT: s_mov_b32 s57, s36 +; GFX9-NEXT: s_mov_b32 s58, s36 +; GFX9-NEXT: s_mov_b32 s59, s36 +; GFX9-NEXT: s_mov_b32 s60, s36 +; GFX9-NEXT: s_mov_b32 s61, s36 +; GFX9-NEXT: s_mov_b32 s62, s36 +; GFX9-NEXT: s_mov_b32 s63, s36 +; GFX9-NEXT: s_mov_b32 s64, s36 +; GFX9-NEXT: s_mov_b32 s65, s36 +; GFX9-NEXT: s_mov_b32 s66, s36 +; GFX9-NEXT: s_mov_b32 s67, s36 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v31, s36 +; GFX9-NEXT: v_mov_b32_e32 v32, s37 +; GFX9-NEXT: v_mov_b32_e32 
v33, s38 +; GFX9-NEXT: v_mov_b32_e32 v34, s39 +; GFX9-NEXT: v_mov_b32_e32 v35, s40 +; GFX9-NEXT: v_mov_b32_e32 v36, s41 +; GFX9-NEXT: v_mov_b32_e32 v37, s42 +; GFX9-NEXT: v_mov_b32_e32 v38, s43 +; GFX9-NEXT: v_mov_b32_e32 v39, s44 +; GFX9-NEXT: v_mov_b32_e32 v40, s45 +; GFX9-NEXT: v_mov_b32_e32 v41, s46 +; GFX9-NEXT: v_mov_b32_e32 v42, s47 +; GFX9-NEXT: v_mov_b32_e32 v43, s48 +; GFX9-NEXT: v_mov_b32_e32 v44, s49 +; GFX9-NEXT: v_mov_b32_e32 v45, s50 +; GFX9-NEXT: v_mov_b32_e32 v46, s51 +; GFX9-NEXT: v_mov_b32_e32 v47, s52 +; GFX9-NEXT: v_mov_b32_e32 v48, s53 +; GFX9-NEXT: v_mov_b32_e32 v49, s54 +; GFX9-NEXT: v_mov_b32_e32 v50, s55 +; GFX9-NEXT: v_mov_b32_e32 v51, s56 +; GFX9-NEXT: v_mov_b32_e32 v52, s57 +; GFX9-NEXT: v_mov_b32_e32 v53, s58 +; GFX9-NEXT: v_mov_b32_e32 v54, s59 +; GFX9-NEXT: v_mov_b32_e32 v55, s60 +; GFX9-NEXT: v_mov_b32_e32 v56, s61 +; GFX9-NEXT: v_mov_b32_e32 v57, s62 +; GFX9-NEXT: v_mov_b32_e32 v58, s63 +; GFX9-NEXT: v_mov_b32_e32 v59, s64 +; GFX9-NEXT: v_mov_b32_e32 v60, s65 +; GFX9-NEXT: v_mov_b32_e32 v61, s66 +; GFX9-NEXT: v_mov_b32_e32 v62, s67 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz 
.LBB142_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword 
v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte 
Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB142_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], 
s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s67, v63, 15 +; GFX9-NEXT: v_readlane_b32 s66, v63, 14 +; GFX9-NEXT: v_readlane_b32 s65, v63, 13 +; GFX9-NEXT: v_readlane_b32 s64, v63, 12 +; GFX9-NEXT: v_readlane_b32 s55, v63, 11 +; GFX9-NEXT: v_readlane_b32 s54, v63, 10 +; GFX9-NEXT: v_readlane_b32 s53, v63, 9 +; GFX9-NEXT: v_readlane_b32 s52, v63, 8 +; GFX9-NEXT: v_readlane_b32 s51, v63, 7 +; 
GFX9-NEXT: v_readlane_b32 s50, v63, 6 +; GFX9-NEXT: v_readlane_b32 s49, v63, 5 +; GFX9-NEXT: v_readlane_b32 s48, v63, 4 +; GFX9-NEXT: v_readlane_b32 s39, v63, 3 +; GFX9-NEXT: v_readlane_b32 s38, v63, 2 +; GFX9-NEXT: v_readlane_b32 s37, v63, 1 +; GFX9-NEXT: v_readlane_b32 s36, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:96 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:80 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off offset:64 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v64bf16_to_v128i8: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v67, s32 offset:80 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 +; GFX11-NEXT: ; meta instruction +; 
GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 +; GFX11-NEXT: v_writelane_b32 v67, s30, 0 +; GFX11-NEXT: v_writelane_b32 v67, s31, 1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s31, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s16, s0 +; GFX11-NEXT: s_mov_b32 s17, s0 +; GFX11-NEXT: s_mov_b32 s18, s0 +; GFX11-NEXT: s_mov_b32 s19, s0 +; GFX11-NEXT: s_mov_b32 s20, s0 +; GFX11-NEXT: s_mov_b32 s21, s0 +; GFX11-NEXT: s_mov_b32 s22, s0 +; GFX11-NEXT: s_mov_b32 s23, s0 +; GFX11-NEXT: s_mov_b32 s24, s0 +; GFX11-NEXT: s_mov_b32 s25, s0 +; GFX11-NEXT: s_mov_b32 s26, s0 +; GFX11-NEXT: s_mov_b32 s27, s0 +; GFX11-NEXT: s_mov_b32 s28, s0 +; GFX11-NEXT: s_mov_b32 s29, s0 +; GFX11-NEXT: s_mov_b32 s30, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v66, s31 :: v_dual_mov_b32 v65, s30 +; GFX11-NEXT: v_dual_mov_b32 v64, s29 :: v_dual_mov_b32 v63, s28 +; GFX11-NEXT: v_dual_mov_b32 v62, s27 :: v_dual_mov_b32 v61, s26 +; GFX11-NEXT: v_dual_mov_b32 v60, s25 :: v_dual_mov_b32 v59, s24 +; GFX11-NEXT: v_dual_mov_b32 v58, s23 :: v_dual_mov_b32 v57, s22 +; GFX11-NEXT: v_dual_mov_b32 v56, s21 :: 
v_dual_mov_b32 v55, s20 +; GFX11-NEXT: v_dual_mov_b32 v54, s19 :: v_dual_mov_b32 v53, s18 +; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v51, s16 +; GFX11-NEXT: v_dual_mov_b32 v50, s15 :: v_dual_mov_b32 v49, s14 +; GFX11-NEXT: v_dual_mov_b32 v48, s13 :: v_dual_mov_b32 v47, s12 +; GFX11-NEXT: v_dual_mov_b32 v46, s11 :: v_dual_mov_b32 v45, s10 +; GFX11-NEXT: v_dual_mov_b32 v44, s9 :: v_dual_mov_b32 v43, s8 +; GFX11-NEXT: v_dual_mov_b32 v42, s7 :: v_dual_mov_b32 v41, s6 +; GFX11-NEXT: v_dual_mov_b32 v40, s5 :: v_dual_mov_b32 v39, s4 +; GFX11-NEXT: v_dual_mov_b32 v38, s3 :: v_dual_mov_b32 v37, s2 +; GFX11-NEXT: v_dual_mov_b32 v36, s1 :: v_dual_mov_b32 v35, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB142_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33 +; GFX11-NEXT: v_dual_mov_b32 v64, v32 :: v_dual_mov_b32 v63, v31 +; GFX11-NEXT: v_dual_mov_b32 v62, v30 :: v_dual_mov_b32 v61, v29 +; GFX11-NEXT: v_dual_mov_b32 v60, v28 :: v_dual_mov_b32 v59, v27 +; GFX11-NEXT: v_dual_mov_b32 v58, v26 :: v_dual_mov_b32 v57, v25 +; GFX11-NEXT: v_dual_mov_b32 v56, v24 :: v_dual_mov_b32 v55, v23 +; GFX11-NEXT: v_dual_mov_b32 v54, v22 :: v_dual_mov_b32 v53, v21 +; GFX11-NEXT: v_dual_mov_b32 v52, v20 :: v_dual_mov_b32 v51, v19 +; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v49, v17 +; GFX11-NEXT: v_dual_mov_b32 v48, v16 :: v_dual_mov_b32 v47, v15 +; GFX11-NEXT: v_dual_mov_b32 v46, v14 :: v_dual_mov_b32 v45, v13 +; GFX11-NEXT: v_dual_mov_b32 v44, v12 :: v_dual_mov_b32 v43, v11 +; GFX11-NEXT: v_dual_mov_b32 v42, v10 :: v_dual_mov_b32 v41, v9 +; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7 +; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5 +; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3 +; GFX11-NEXT: .LBB142_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: 
s_clause 0x7 +; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112 +; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:96 +; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:80 +; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:64 +; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 +; GFX11-NEXT: v_readlane_b32 s30, v67, 0 +; GFX11-NEXT: v_readlane_b32 s31, v67, 1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:80 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -2081,8 +34332,1330 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v16i64: + define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, 
<64 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v64bf16_to_v16i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v63, s36, 0 +; GCN-NEXT: v_writelane_b32 v63, s37, 1 +; GCN-NEXT: v_writelane_b32 v63, s38, 2 +; GCN-NEXT: v_writelane_b32 v63, s39, 3 +; GCN-NEXT: v_writelane_b32 v63, s48, 4 +; GCN-NEXT: v_writelane_b32 v63, s49, 5 +; GCN-NEXT: v_writelane_b32 v63, s50, 6 +; GCN-NEXT: v_writelane_b32 v63, s51, 7 +; GCN-NEXT: v_writelane_b32 v63, s52, 8 +; GCN-NEXT: 
v_writelane_b32 v63, s53, 9 +; GCN-NEXT: v_writelane_b32 v63, s54, 10 +; GCN-NEXT: v_writelane_b32 v63, s55, 11 +; GCN-NEXT: v_writelane_b32 v63, s64, 12 +; GCN-NEXT: v_writelane_b32 v63, s65, 13 +; GCN-NEXT: v_writelane_b32 v63, s66, 14 +; GCN-NEXT: v_writelane_b32 v63, s67, 15 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:212 ; 
4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 +; GCN-NEXT: s_mov_b32 s36, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s37, s36 +; GCN-NEXT: s_mov_b32 s38, s36 +; GCN-NEXT: s_mov_b32 s39, s36 +; GCN-NEXT: s_mov_b32 s40, s36 +; GCN-NEXT: s_mov_b32 s41, s36 +; GCN-NEXT: s_mov_b32 s42, s36 +; GCN-NEXT: s_mov_b32 s43, s36 +; GCN-NEXT: s_mov_b32 s44, s36 +; GCN-NEXT: s_mov_b32 s45, s36 +; GCN-NEXT: s_mov_b32 s46, s36 +; GCN-NEXT: s_mov_b32 s47, s36 +; GCN-NEXT: s_mov_b32 s48, s36 +; GCN-NEXT: s_mov_b32 s49, s36 +; GCN-NEXT: s_mov_b32 s50, s36 +; GCN-NEXT: s_mov_b32 s51, s36 +; GCN-NEXT: s_mov_b32 s52, s36 +; 
GCN-NEXT: s_mov_b32 s53, s36 +; GCN-NEXT: s_mov_b32 s54, s36 +; GCN-NEXT: s_mov_b32 s55, s36 +; GCN-NEXT: s_mov_b32 s56, s36 +; GCN-NEXT: s_mov_b32 s57, s36 +; GCN-NEXT: s_mov_b32 s58, s36 +; GCN-NEXT: s_mov_b32 s59, s36 +; GCN-NEXT: s_mov_b32 s60, s36 +; GCN-NEXT: s_mov_b32 s61, s36 +; GCN-NEXT: s_mov_b32 s62, s36 +; GCN-NEXT: s_mov_b32 s63, s36 +; GCN-NEXT: s_mov_b32 s64, s36 +; GCN-NEXT: s_mov_b32 s65, s36 +; GCN-NEXT: s_mov_b32 s66, s36 +; GCN-NEXT: s_mov_b32 s67, s36 +; GCN-NEXT: v_mov_b32_e32 v31, s36 +; GCN-NEXT: v_mov_b32_e32 v32, s37 +; GCN-NEXT: v_mov_b32_e32 v33, s38 +; GCN-NEXT: v_mov_b32_e32 v34, s39 +; GCN-NEXT: v_mov_b32_e32 v35, s40 +; GCN-NEXT: v_mov_b32_e32 v36, s41 +; GCN-NEXT: v_mov_b32_e32 v37, s42 +; GCN-NEXT: v_mov_b32_e32 v38, s43 +; GCN-NEXT: v_mov_b32_e32 v39, s44 +; GCN-NEXT: v_mov_b32_e32 v40, s45 +; GCN-NEXT: v_mov_b32_e32 v41, s46 +; GCN-NEXT: v_mov_b32_e32 v42, s47 +; GCN-NEXT: v_mov_b32_e32 v43, s48 +; GCN-NEXT: v_mov_b32_e32 v44, s49 +; GCN-NEXT: v_mov_b32_e32 v45, s50 +; GCN-NEXT: v_mov_b32_e32 v46, s51 +; GCN-NEXT: v_mov_b32_e32 v47, s52 +; GCN-NEXT: v_mov_b32_e32 v48, s53 +; GCN-NEXT: v_mov_b32_e32 v49, s54 +; GCN-NEXT: v_mov_b32_e32 v50, s55 +; GCN-NEXT: v_mov_b32_e32 v51, s56 +; GCN-NEXT: v_mov_b32_e32 v52, s57 +; GCN-NEXT: v_mov_b32_e32 v53, s58 +; GCN-NEXT: v_mov_b32_e32 v54, s59 +; GCN-NEXT: v_mov_b32_e32 v55, s60 +; GCN-NEXT: v_mov_b32_e32 v56, s61 +; GCN-NEXT: v_mov_b32_e32 v57, s62 +; GCN-NEXT: v_mov_b32_e32 v58, s63 +; GCN-NEXT: v_mov_b32_e32 v59, s64 +; GCN-NEXT: v_mov_b32_e32 v60, s65 +; GCN-NEXT: v_mov_b32_e32 v61, s66 +; GCN-NEXT: v_mov_b32_e32 v62, s67 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB143_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: 
v_alignbit_b32 v31, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v36, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v37, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v38, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v39, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v40, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; 
GCN-NEXT: v_alignbit_b32 v41, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v42, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v43, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v44, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v45, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v46, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v47, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v48, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v27 +; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) 
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v8 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v9 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v10 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 
offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_alignbit_b32 v49, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v50, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v51, v16, v17, 16 +; GCN-NEXT: v_alignbit_b32 v52, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v53, v20, v0, 16 +; GCN-NEXT: v_alignbit_b32 v54, v21, v3, 16 +; GCN-NEXT: v_alignbit_b32 v55, v22, v4, 16 +; GCN-NEXT: v_alignbit_b32 v56, v23, v5, 16 +; GCN-NEXT: v_alignbit_b32 v57, v24, v6, 16 +; GCN-NEXT: v_alignbit_b32 v58, v25, v7, 16 +; GCN-NEXT: v_alignbit_b32 v59, v26, v8, 16 +; GCN-NEXT: v_alignbit_b32 v60, v27, v9, 16 +; GCN-NEXT: v_alignbit_b32 v61, v28, v10, 16 +; GCN-NEXT: v_alignbit_b32 v62, v29, v11, 16 +; GCN-NEXT: .LBB143_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s37, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s36 +; GCN-NEXT: s_mov_b32 s5, s36 +; GCN-NEXT: s_mov_b64 s[6:7], s[36:37] +; GCN-NEXT: buffer_store_dwordx4 v[59:62], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_store_dwordx4 v[55:58], v[1:2], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_store_dwordx4 v[51:54], v[1:2], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[47:50], v[1:2], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 
v[39:42], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_readlane_b32 s67, v63, 15 +; GCN-NEXT: v_readlane_b32 s66, v63, 14 +; GCN-NEXT: v_readlane_b32 s65, v63, 13 +; GCN-NEXT: v_readlane_b32 s64, v63, 12 +; GCN-NEXT: v_readlane_b32 s55, v63, 11 +; GCN-NEXT: v_readlane_b32 s54, v63, 10 +; GCN-NEXT: v_readlane_b32 s53, v63, 9 +; GCN-NEXT: v_readlane_b32 s52, v63, 8 +; GCN-NEXT: v_readlane_b32 s51, v63, 7 +; GCN-NEXT: v_readlane_b32 s50, v63, 6 +; GCN-NEXT: v_readlane_b32 s49, v63, 5 +; GCN-NEXT: v_readlane_b32 s48, v63, 4 +; GCN-NEXT: v_readlane_b32 s39, v63, 3 +; GCN-NEXT: v_readlane_b32 s38, v63, 2 +; GCN-NEXT: v_readlane_b32 s37, v63, 1 +; GCN-NEXT: v_readlane_b32 s36, v63, 0 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: 
s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v64bf16_to_v16i64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: v_writelane_b32 v63, s36, 0 +; VI-NEXT: v_writelane_b32 v63, s37, 1 +; VI-NEXT: v_writelane_b32 v63, s38, 2 +; VI-NEXT: v_writelane_b32 v63, s39, 3 +; VI-NEXT: v_writelane_b32 v63, s48, 4 +; VI-NEXT: v_writelane_b32 v63, s49, 5 +; VI-NEXT: v_writelane_b32 v63, s50, 6 +; VI-NEXT: v_writelane_b32 v63, s51, 7 +; VI-NEXT: v_writelane_b32 v63, s52, 8 +; VI-NEXT: v_writelane_b32 v63, s53, 9 +; VI-NEXT: v_writelane_b32 v63, s54, 10 +; VI-NEXT: v_writelane_b32 v63, s55, 11 +; VI-NEXT: v_writelane_b32 v63, s64, 12 +; VI-NEXT: v_writelane_b32 v63, s65, 13 +; VI-NEXT: v_writelane_b32 v63, s66, 14 +; VI-NEXT: v_writelane_b32 v63, s67, 15 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s36, 0 +; VI-NEXT: s_mov_b32 s37, s36 +; VI-NEXT: s_mov_b32 s38, s36 +; VI-NEXT: s_mov_b32 s39, s36 +; VI-NEXT: s_mov_b32 s40, s36 +; VI-NEXT: s_mov_b32 s41, s36 +; VI-NEXT: s_mov_b32 s42, s36 +; VI-NEXT: s_mov_b32 s43, s36 +; VI-NEXT: s_mov_b32 s44, s36 +; VI-NEXT: s_mov_b32 s45, s36 +; VI-NEXT: s_mov_b32 s46, s36 +; VI-NEXT: s_mov_b32 s47, s36 +; VI-NEXT: s_mov_b32 s48, s36 +; VI-NEXT: s_mov_b32 s49, s36 +; VI-NEXT: s_mov_b32 s50, s36 +; VI-NEXT: s_mov_b32 s51, s36 +; VI-NEXT: s_mov_b32 s52, s36 +; VI-NEXT: s_mov_b32 s53, s36 +; VI-NEXT: s_mov_b32 s54, s36 +; VI-NEXT: s_mov_b32 s55, s36 +; VI-NEXT: s_mov_b32 s56, s36 +; VI-NEXT: s_mov_b32 s57, s36 +; VI-NEXT: s_mov_b32 s58, s36 +; VI-NEXT: s_mov_b32 s59, s36 +; VI-NEXT: s_mov_b32 s60, s36 +; VI-NEXT: s_mov_b32 s61, s36 +; VI-NEXT: s_mov_b32 s62, s36 +; VI-NEXT: s_mov_b32 s63, s36 +; VI-NEXT: s_mov_b32 s64, s36 +; VI-NEXT: s_mov_b32 s65, s36 +; VI-NEXT: s_mov_b32 s66, s36 +; VI-NEXT: s_mov_b32 s67, s36 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v31, s36 +; VI-NEXT: v_mov_b32_e32 v32, s37 +; VI-NEXT: v_mov_b32_e32 v33, s38 +; VI-NEXT: v_mov_b32_e32 v34, s39 +; VI-NEXT: v_mov_b32_e32 v35, s40 +; VI-NEXT: v_mov_b32_e32 v36, s41 +; VI-NEXT: v_mov_b32_e32 v37, s42 +; VI-NEXT: v_mov_b32_e32 v38, s43 +; VI-NEXT: v_mov_b32_e32 v39, s44 +; VI-NEXT: v_mov_b32_e32 v40, s45 +; VI-NEXT: v_mov_b32_e32 v41, s46 +; VI-NEXT: v_mov_b32_e32 v42, s47 +; VI-NEXT: v_mov_b32_e32 v43, s48 +; VI-NEXT: v_mov_b32_e32 v44, s49 +; VI-NEXT: v_mov_b32_e32 v45, s50 +; VI-NEXT: v_mov_b32_e32 v46, s51 +; VI-NEXT: v_mov_b32_e32 v47, s52 +; VI-NEXT: v_mov_b32_e32 v48, s53 +; VI-NEXT: v_mov_b32_e32 v49, s54 +; VI-NEXT: v_mov_b32_e32 v50, s55 +; VI-NEXT: v_mov_b32_e32 v51, s56 +; VI-NEXT: v_mov_b32_e32 v52, s57 +; VI-NEXT: v_mov_b32_e32 v53, s58 +; VI-NEXT: v_mov_b32_e32 v54, s59 +; VI-NEXT: v_mov_b32_e32 v55, s60 +; VI-NEXT: v_mov_b32_e32 v56, s61 +; VI-NEXT: v_mov_b32_e32 v57, s62 +; VI-NEXT: v_mov_b32_e32 v58, s63 +; VI-NEXT: v_mov_b32_e32 v59, s64 +; VI-NEXT: v_mov_b32_e32 v60, s65 +; VI-NEXT: v_mov_b32_e32 v61, s66 +; VI-NEXT: v_mov_b32_e32 v62, s67 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, 
off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB143_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, 
s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, 
s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: .LBB143_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x70, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: v_readlane_b32 s67, v63, 15 +; VI-NEXT: v_readlane_b32 s66, v63, 14 +; VI-NEXT: v_readlane_b32 s65, v63, 13 +; VI-NEXT: v_readlane_b32 s64, v63, 12 +; VI-NEXT: v_readlane_b32 s55, v63, 11 +; VI-NEXT: v_readlane_b32 s54, v63, 10 +; VI-NEXT: v_readlane_b32 s53, v63, 9 +; VI-NEXT: v_readlane_b32 s52, v63, 8 +; VI-NEXT: v_readlane_b32 s51, v63, 7 +; VI-NEXT: v_readlane_b32 s50, v63, 6 +; VI-NEXT: v_readlane_b32 s49, v63, 5 +; VI-NEXT: v_readlane_b32 s48, v63, 4 +; VI-NEXT: v_readlane_b32 s39, v63, 3 +; VI-NEXT: v_readlane_b32 s38, v63, 2 +; VI-NEXT: v_readlane_b32 s37, v63, 1 +; VI-NEXT: v_readlane_b32 s36, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dwordx4 v[3:4], v[33:36] +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x60, v1 +; VI-NEXT: v_addc_u32_e32 v4, 
vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[29:32] +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x50, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[25:28] +; VI-NEXT: v_add_u32_e32 v3, vcc, 64, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[21:24] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[5:8] +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v64bf16_to_v16i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v63, s36, 0 +; GFX9-NEXT: v_writelane_b32 v63, s37, 1 +; GFX9-NEXT: 
v_writelane_b32 v63, s38, 2 +; GFX9-NEXT: v_writelane_b32 v63, s39, 3 +; GFX9-NEXT: v_writelane_b32 v63, s48, 4 +; GFX9-NEXT: v_writelane_b32 v63, s49, 5 +; GFX9-NEXT: v_writelane_b32 v63, s50, 6 +; GFX9-NEXT: v_writelane_b32 v63, s51, 7 +; GFX9-NEXT: v_writelane_b32 v63, s52, 8 +; GFX9-NEXT: v_writelane_b32 v63, s53, 9 +; GFX9-NEXT: v_writelane_b32 v63, s54, 10 +; GFX9-NEXT: v_writelane_b32 v63, s55, 11 +; GFX9-NEXT: v_writelane_b32 v63, s64, 12 +; GFX9-NEXT: v_writelane_b32 v63, s65, 13 +; GFX9-NEXT: v_writelane_b32 v63, s66, 14 +; GFX9-NEXT: v_writelane_b32 v63, s67, 15 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s36, 0 +; GFX9-NEXT: s_mov_b32 s37, s36 +; GFX9-NEXT: s_mov_b32 s38, s36 +; GFX9-NEXT: s_mov_b32 s39, s36 +; GFX9-NEXT: s_mov_b32 s40, s36 +; GFX9-NEXT: s_mov_b32 s41, s36 +; GFX9-NEXT: s_mov_b32 s42, s36 +; GFX9-NEXT: s_mov_b32 s43, s36 +; GFX9-NEXT: s_mov_b32 s44, s36 +; GFX9-NEXT: s_mov_b32 s45, s36 +; GFX9-NEXT: s_mov_b32 s46, s36 +; GFX9-NEXT: s_mov_b32 s47, s36 +; GFX9-NEXT: s_mov_b32 s48, s36 +; GFX9-NEXT: s_mov_b32 s49, s36 +; GFX9-NEXT: s_mov_b32 s50, s36 +; GFX9-NEXT: s_mov_b32 s51, s36 +; GFX9-NEXT: s_mov_b32 s52, s36 +; GFX9-NEXT: s_mov_b32 s53, s36 +; GFX9-NEXT: s_mov_b32 s54, s36 +; GFX9-NEXT: s_mov_b32 s55, s36 +; GFX9-NEXT: s_mov_b32 s56, s36 +; GFX9-NEXT: s_mov_b32 s57, s36 +; GFX9-NEXT: s_mov_b32 s58, s36 +; GFX9-NEXT: s_mov_b32 s59, s36 +; GFX9-NEXT: s_mov_b32 s60, s36 +; GFX9-NEXT: s_mov_b32 s61, s36 +; GFX9-NEXT: s_mov_b32 s62, s36 +; GFX9-NEXT: s_mov_b32 s63, s36 +; GFX9-NEXT: s_mov_b32 s64, s36 +; GFX9-NEXT: s_mov_b32 s65, s36 +; GFX9-NEXT: s_mov_b32 s66, s36 +; GFX9-NEXT: s_mov_b32 s67, s36 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte 
Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword 
v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v31, s36 +; GFX9-NEXT: v_mov_b32_e32 v32, s37 +; GFX9-NEXT: v_mov_b32_e32 v33, s38 +; GFX9-NEXT: v_mov_b32_e32 v34, s39 +; GFX9-NEXT: v_mov_b32_e32 v35, s40 +; GFX9-NEXT: v_mov_b32_e32 v36, s41 +; GFX9-NEXT: v_mov_b32_e32 v37, s42 +; GFX9-NEXT: v_mov_b32_e32 v38, s43 +; GFX9-NEXT: v_mov_b32_e32 v39, s44 +; GFX9-NEXT: v_mov_b32_e32 v40, s45 +; GFX9-NEXT: v_mov_b32_e32 v41, s46 +; GFX9-NEXT: v_mov_b32_e32 v42, s47 +; GFX9-NEXT: v_mov_b32_e32 v43, s48 +; GFX9-NEXT: v_mov_b32_e32 v44, s49 +; GFX9-NEXT: v_mov_b32_e32 v45, s50 +; GFX9-NEXT: v_mov_b32_e32 v46, s51 +; GFX9-NEXT: v_mov_b32_e32 v47, s52 +; GFX9-NEXT: v_mov_b32_e32 v48, s53 +; GFX9-NEXT: v_mov_b32_e32 v49, s54 +; GFX9-NEXT: v_mov_b32_e32 v50, s55 +; GFX9-NEXT: v_mov_b32_e32 v51, s56 +; GFX9-NEXT: v_mov_b32_e32 v52, s57 +; GFX9-NEXT: v_mov_b32_e32 v53, s58 +; GFX9-NEXT: v_mov_b32_e32 v54, s59 +; GFX9-NEXT: v_mov_b32_e32 v55, s60 +; GFX9-NEXT: v_mov_b32_e32 v56, s61 +; GFX9-NEXT: v_mov_b32_e32 v57, s62 +; GFX9-NEXT: v_mov_b32_e32 v58, s63 +; GFX9-NEXT: v_mov_b32_e32 v59, s64 +; GFX9-NEXT: v_mov_b32_e32 v60, s65 +; GFX9-NEXT: v_mov_b32_e32 v61, s66 +; GFX9-NEXT: v_mov_b32_e32 v62, s67 +; GFX9-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 
offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB143_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; 
GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: 
buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB143_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:176 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s67, v63, 15 +; GFX9-NEXT: v_readlane_b32 s66, v63, 14 +; GFX9-NEXT: v_readlane_b32 s65, v63, 13 +; GFX9-NEXT: v_readlane_b32 s64, v63, 12 +; GFX9-NEXT: v_readlane_b32 s55, v63, 11 +; GFX9-NEXT: v_readlane_b32 s54, v63, 10 +; GFX9-NEXT: v_readlane_b32 s53, v63, 9 +; GFX9-NEXT: v_readlane_b32 s52, v63, 8 +; GFX9-NEXT: v_readlane_b32 s51, v63, 7 +; GFX9-NEXT: v_readlane_b32 s50, v63, 6 +; GFX9-NEXT: v_readlane_b32 s49, v63, 5 +; GFX9-NEXT: v_readlane_b32 s48, v63, 4 +; GFX9-NEXT: v_readlane_b32 s39, v63, 3 +; GFX9-NEXT: v_readlane_b32 s38, v63, 2 +; GFX9-NEXT: v_readlane_b32 s37, v63, 1 +; GFX9-NEXT: v_readlane_b32 s36, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:96 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:80 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off offset:64 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 
; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v64bf16_to_v16i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v67, s32 offset:80 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: 
scratch_store_b32 off, v44, s32 offset:60 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 +; GFX11-NEXT: v_writelane_b32 v67, s30, 0 +; GFX11-NEXT: v_writelane_b32 v67, s31, 1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s31, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s16, 
s0 +; GFX11-NEXT: s_mov_b32 s17, s0 +; GFX11-NEXT: s_mov_b32 s18, s0 +; GFX11-NEXT: s_mov_b32 s19, s0 +; GFX11-NEXT: s_mov_b32 s20, s0 +; GFX11-NEXT: s_mov_b32 s21, s0 +; GFX11-NEXT: s_mov_b32 s22, s0 +; GFX11-NEXT: s_mov_b32 s23, s0 +; GFX11-NEXT: s_mov_b32 s24, s0 +; GFX11-NEXT: s_mov_b32 s25, s0 +; GFX11-NEXT: s_mov_b32 s26, s0 +; GFX11-NEXT: s_mov_b32 s27, s0 +; GFX11-NEXT: s_mov_b32 s28, s0 +; GFX11-NEXT: s_mov_b32 s29, s0 +; GFX11-NEXT: s_mov_b32 s30, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v66, s31 :: v_dual_mov_b32 v65, s30 +; GFX11-NEXT: v_dual_mov_b32 v64, s29 :: v_dual_mov_b32 v63, s28 +; GFX11-NEXT: v_dual_mov_b32 v62, s27 :: v_dual_mov_b32 v61, s26 +; GFX11-NEXT: v_dual_mov_b32 v60, s25 :: v_dual_mov_b32 v59, s24 +; GFX11-NEXT: v_dual_mov_b32 v58, s23 :: v_dual_mov_b32 v57, s22 +; GFX11-NEXT: v_dual_mov_b32 v56, s21 :: v_dual_mov_b32 v55, s20 +; GFX11-NEXT: v_dual_mov_b32 v54, s19 :: v_dual_mov_b32 v53, s18 +; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v51, s16 +; GFX11-NEXT: v_dual_mov_b32 v50, s15 :: v_dual_mov_b32 v49, s14 +; GFX11-NEXT: v_dual_mov_b32 v48, s13 :: v_dual_mov_b32 v47, s12 +; GFX11-NEXT: v_dual_mov_b32 v46, s11 :: v_dual_mov_b32 v45, s10 +; GFX11-NEXT: v_dual_mov_b32 v44, s9 :: v_dual_mov_b32 v43, s8 +; GFX11-NEXT: v_dual_mov_b32 v42, s7 :: v_dual_mov_b32 v41, s6 +; GFX11-NEXT: v_dual_mov_b32 v40, s5 :: v_dual_mov_b32 v39, s4 +; GFX11-NEXT: v_dual_mov_b32 v38, s3 :: v_dual_mov_b32 v37, s2 +; GFX11-NEXT: v_dual_mov_b32 v36, s1 :: v_dual_mov_b32 v35, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB143_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33 +; GFX11-NEXT: v_dual_mov_b32 v64, v32 :: v_dual_mov_b32 v63, v31 +; GFX11-NEXT: v_dual_mov_b32 v62, v30 :: v_dual_mov_b32 v61, v29 +; GFX11-NEXT: v_dual_mov_b32 v60, v28 :: v_dual_mov_b32 
v59, v27 +; GFX11-NEXT: v_dual_mov_b32 v58, v26 :: v_dual_mov_b32 v57, v25 +; GFX11-NEXT: v_dual_mov_b32 v56, v24 :: v_dual_mov_b32 v55, v23 +; GFX11-NEXT: v_dual_mov_b32 v54, v22 :: v_dual_mov_b32 v53, v21 +; GFX11-NEXT: v_dual_mov_b32 v52, v20 :: v_dual_mov_b32 v51, v19 +; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v49, v17 +; GFX11-NEXT: v_dual_mov_b32 v48, v16 :: v_dual_mov_b32 v47, v15 +; GFX11-NEXT: v_dual_mov_b32 v46, v14 :: v_dual_mov_b32 v45, v13 +; GFX11-NEXT: v_dual_mov_b32 v44, v12 :: v_dual_mov_b32 v43, v11 +; GFX11-NEXT: v_dual_mov_b32 v42, v10 :: v_dual_mov_b32 v41, v9 +; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7 +; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5 +; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3 +; GFX11-NEXT: .LBB143_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112 +; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:96 +; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:80 +; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:64 +; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 +; GFX11-NEXT: v_readlane_b32 s30, v67, 0 +; GFX11-NEXT: v_readlane_b32 s31, v67, 1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:80 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -2097,8 +35670,1330 @@ end: ret void } -; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v16f64: + define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) { +; GCN-LABEL: v_bitcast_v64bf16_to_v16f64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v63, s36, 0 +; GCN-NEXT: v_writelane_b32 v63, s37, 1 +; GCN-NEXT: v_writelane_b32 v63, s38, 2 +; GCN-NEXT: v_writelane_b32 v63, s39, 3 +; GCN-NEXT: v_writelane_b32 v63, s48, 4 +; GCN-NEXT: v_writelane_b32 v63, s49, 5 +; GCN-NEXT: v_writelane_b32 v63, s50, 6 +; GCN-NEXT: v_writelane_b32 v63, s51, 7 +; GCN-NEXT: v_writelane_b32 v63, s52, 8 +; GCN-NEXT: v_writelane_b32 v63, s53, 9 +; GCN-NEXT: v_writelane_b32 v63, s54, 10 +; GCN-NEXT: v_writelane_b32 v63, s55, 11 +; GCN-NEXT: v_writelane_b32 v63, s64, 12 +; GCN-NEXT: v_writelane_b32 v63, s65, 13 +; GCN-NEXT: v_writelane_b32 v63, s66, 14 +; GCN-NEXT: v_writelane_b32 v63, s67, 15 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, 
s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword 
v21, off, s[0:3], s32 offset:68 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; GCN-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 +; GCN-NEXT: s_mov_b32 s36, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b32 s37, s36 +; GCN-NEXT: s_mov_b32 s38, s36 +; GCN-NEXT: s_mov_b32 s39, s36 +; GCN-NEXT: s_mov_b32 s40, s36 +; GCN-NEXT: s_mov_b32 s41, s36 +; GCN-NEXT: s_mov_b32 s42, s36 +; GCN-NEXT: s_mov_b32 s43, s36 +; GCN-NEXT: s_mov_b32 s44, s36 +; GCN-NEXT: s_mov_b32 s45, s36 +; GCN-NEXT: s_mov_b32 s46, s36 +; GCN-NEXT: s_mov_b32 s47, s36 +; GCN-NEXT: s_mov_b32 s48, s36 +; GCN-NEXT: s_mov_b32 s49, s36 +; GCN-NEXT: s_mov_b32 s50, s36 +; GCN-NEXT: s_mov_b32 s51, s36 +; GCN-NEXT: s_mov_b32 s52, s36 +; GCN-NEXT: s_mov_b32 s53, s36 +; GCN-NEXT: s_mov_b32 s54, s36 +; GCN-NEXT: s_mov_b32 s55, s36 +; GCN-NEXT: s_mov_b32 s56, s36 +; GCN-NEXT: s_mov_b32 s57, s36 +; GCN-NEXT: s_mov_b32 s58, s36 +; GCN-NEXT: s_mov_b32 s59, s36 +; GCN-NEXT: s_mov_b32 s60, s36 +; GCN-NEXT: s_mov_b32 s61, s36 +; GCN-NEXT: s_mov_b32 s62, s36 +; GCN-NEXT: s_mov_b32 s63, s36 +; GCN-NEXT: s_mov_b32 s64, s36 +; GCN-NEXT: s_mov_b32 s65, s36 +; GCN-NEXT: s_mov_b32 s66, s36 +; GCN-NEXT: s_mov_b32 s67, s36 +; GCN-NEXT: v_mov_b32_e32 v31, s36 +; GCN-NEXT: v_mov_b32_e32 v32, s37 +; GCN-NEXT: v_mov_b32_e32 v33, s38 +; GCN-NEXT: v_mov_b32_e32 v34, s39 +; GCN-NEXT: v_mov_b32_e32 v35, s40 +; GCN-NEXT: v_mov_b32_e32 v36, s41 +; GCN-NEXT: v_mov_b32_e32 v37, s42 +; GCN-NEXT: v_mov_b32_e32 v38, s43 +; GCN-NEXT: v_mov_b32_e32 v39, s44 +; GCN-NEXT: v_mov_b32_e32 v40, s45 +; GCN-NEXT: v_mov_b32_e32 v41, s46 +; GCN-NEXT: v_mov_b32_e32 v42, s47 +; GCN-NEXT: v_mov_b32_e32 v43, s48 +; GCN-NEXT: v_mov_b32_e32 v44, s49 +; GCN-NEXT: v_mov_b32_e32 v45, s50 +; GCN-NEXT: v_mov_b32_e32 v46, s51 +; GCN-NEXT: v_mov_b32_e32 v47, s52 
+; GCN-NEXT: v_mov_b32_e32 v48, s53 +; GCN-NEXT: v_mov_b32_e32 v49, s54 +; GCN-NEXT: v_mov_b32_e32 v50, s55 +; GCN-NEXT: v_mov_b32_e32 v51, s56 +; GCN-NEXT: v_mov_b32_e32 v52, s57 +; GCN-NEXT: v_mov_b32_e32 v53, s58 +; GCN-NEXT: v_mov_b32_e32 v54, s59 +; GCN-NEXT: v_mov_b32_e32 v55, s60 +; GCN-NEXT: v_mov_b32_e32 v56, s61 +; GCN-NEXT: v_mov_b32_e32 v57, s62 +; GCN-NEXT: v_mov_b32_e32 v58, s63 +; GCN-NEXT: v_mov_b32_e32 v59, s64 +; GCN-NEXT: v_mov_b32_e32 v60, s65 +; GCN-NEXT: v_mov_b32_e32 v61, s66 +; GCN-NEXT: v_mov_b32_e32 v62, s67 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB144_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 +; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v36, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v37, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v38, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 
v0, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v39, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v40, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v41, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v42, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v43, v0, v3, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v44, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v45, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v46, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v47, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v48, v0, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v27 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v0 +; 
GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v8 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v9 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v10 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_alignbit_b32 v49, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v50, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v51, v16, v17, 16 +; GCN-NEXT: v_alignbit_b32 v52, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v53, v20, v0, 16 +; GCN-NEXT: v_alignbit_b32 v54, v21, v3, 16 +; GCN-NEXT: v_alignbit_b32 v55, v22, v4, 16 +; GCN-NEXT: v_alignbit_b32 v56, v23, v5, 16 +; GCN-NEXT: 
v_alignbit_b32 v57, v24, v6, 16 +; GCN-NEXT: v_alignbit_b32 v58, v25, v7, 16 +; GCN-NEXT: v_alignbit_b32 v59, v26, v8, 16 +; GCN-NEXT: v_alignbit_b32 v60, v27, v9, 16 +; GCN-NEXT: v_alignbit_b32 v61, v28, v10, 16 +; GCN-NEXT: v_alignbit_b32 v62, v29, v11, 16 +; GCN-NEXT: .LBB144_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s37, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s36 +; GCN-NEXT: s_mov_b32 s5, s36 +; GCN-NEXT: s_mov_b64 s[6:7], s[36:37] +; GCN-NEXT: buffer_store_dwordx4 v[59:62], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_store_dwordx4 v[55:58], v[1:2], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_store_dwordx4 v[51:54], v[1:2], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[47:50], v[1:2], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_readlane_b32 s67, v63, 15 +; GCN-NEXT: v_readlane_b32 s66, v63, 14 +; GCN-NEXT: v_readlane_b32 s65, v63, 13 +; GCN-NEXT: v_readlane_b32 s64, v63, 12 +; GCN-NEXT: v_readlane_b32 s55, v63, 11 +; GCN-NEXT: v_readlane_b32 s54, v63, 10 +; GCN-NEXT: v_readlane_b32 s53, v63, 9 +; GCN-NEXT: v_readlane_b32 s52, v63, 8 +; GCN-NEXT: v_readlane_b32 s51, v63, 7 +; GCN-NEXT: v_readlane_b32 s50, v63, 6 +; GCN-NEXT: v_readlane_b32 s49, v63, 5 +; GCN-NEXT: v_readlane_b32 s48, v63, 4 +; GCN-NEXT: v_readlane_b32 s39, v63, 3 +; GCN-NEXT: v_readlane_b32 s38, v63, 2 +; GCN-NEXT: v_readlane_b32 s37, v63, 1 +; GCN-NEXT: v_readlane_b32 s36, v63, 0 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload 
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_bitcast_v64bf16_to_v16f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, 
s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: v_writelane_b32 v63, s36, 0 +; VI-NEXT: v_writelane_b32 v63, s37, 1 +; VI-NEXT: v_writelane_b32 v63, s38, 2 +; VI-NEXT: v_writelane_b32 v63, s39, 3 +; VI-NEXT: v_writelane_b32 v63, s48, 4 +; VI-NEXT: v_writelane_b32 v63, s49, 5 +; VI-NEXT: v_writelane_b32 v63, s50, 6 +; VI-NEXT: v_writelane_b32 v63, s51, 7 +; VI-NEXT: v_writelane_b32 v63, s52, 8 +; VI-NEXT: v_writelane_b32 v63, s53, 9 +; VI-NEXT: v_writelane_b32 v63, s54, 10 +; VI-NEXT: v_writelane_b32 v63, s55, 11 +; VI-NEXT: v_writelane_b32 v63, s64, 12 +; VI-NEXT: v_writelane_b32 v63, s65, 13 +; VI-NEXT: v_writelane_b32 v63, s66, 14 +; VI-NEXT: v_writelane_b32 v63, s67, 15 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s36, 0 +; VI-NEXT: s_mov_b32 s37, s36 +; VI-NEXT: s_mov_b32 s38, s36 +; VI-NEXT: s_mov_b32 s39, s36 +; VI-NEXT: 
s_mov_b32 s40, s36 +; VI-NEXT: s_mov_b32 s41, s36 +; VI-NEXT: s_mov_b32 s42, s36 +; VI-NEXT: s_mov_b32 s43, s36 +; VI-NEXT: s_mov_b32 s44, s36 +; VI-NEXT: s_mov_b32 s45, s36 +; VI-NEXT: s_mov_b32 s46, s36 +; VI-NEXT: s_mov_b32 s47, s36 +; VI-NEXT: s_mov_b32 s48, s36 +; VI-NEXT: s_mov_b32 s49, s36 +; VI-NEXT: s_mov_b32 s50, s36 +; VI-NEXT: s_mov_b32 s51, s36 +; VI-NEXT: s_mov_b32 s52, s36 +; VI-NEXT: s_mov_b32 s53, s36 +; VI-NEXT: s_mov_b32 s54, s36 +; VI-NEXT: s_mov_b32 s55, s36 +; VI-NEXT: s_mov_b32 s56, s36 +; VI-NEXT: s_mov_b32 s57, s36 +; VI-NEXT: s_mov_b32 s58, s36 +; VI-NEXT: s_mov_b32 s59, s36 +; VI-NEXT: s_mov_b32 s60, s36 +; VI-NEXT: s_mov_b32 s61, s36 +; VI-NEXT: s_mov_b32 s62, s36 +; VI-NEXT: s_mov_b32 s63, s36 +; VI-NEXT: s_mov_b32 s64, s36 +; VI-NEXT: s_mov_b32 s65, s36 +; VI-NEXT: s_mov_b32 s66, s36 +; VI-NEXT: s_mov_b32 s67, s36 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v31, s36 +; VI-NEXT: v_mov_b32_e32 v32, s37 +; VI-NEXT: v_mov_b32_e32 v33, s38 +; VI-NEXT: v_mov_b32_e32 v34, s39 +; VI-NEXT: v_mov_b32_e32 v35, s40 +; VI-NEXT: v_mov_b32_e32 v36, s41 +; VI-NEXT: v_mov_b32_e32 v37, s42 +; VI-NEXT: v_mov_b32_e32 v38, s43 +; VI-NEXT: v_mov_b32_e32 v39, s44 +; 
VI-NEXT: v_mov_b32_e32 v40, s45 +; VI-NEXT: v_mov_b32_e32 v41, s46 +; VI-NEXT: v_mov_b32_e32 v42, s47 +; VI-NEXT: v_mov_b32_e32 v43, s48 +; VI-NEXT: v_mov_b32_e32 v44, s49 +; VI-NEXT: v_mov_b32_e32 v45, s50 +; VI-NEXT: v_mov_b32_e32 v46, s51 +; VI-NEXT: v_mov_b32_e32 v47, s52 +; VI-NEXT: v_mov_b32_e32 v48, s53 +; VI-NEXT: v_mov_b32_e32 v49, s54 +; VI-NEXT: v_mov_b32_e32 v50, s55 +; VI-NEXT: v_mov_b32_e32 v51, s56 +; VI-NEXT: v_mov_b32_e32 v52, s57 +; VI-NEXT: v_mov_b32_e32 v53, s58 +; VI-NEXT: v_mov_b32_e32 v54, s59 +; VI-NEXT: v_mov_b32_e32 v55, s60 +; VI-NEXT: v_mov_b32_e32 v56, s61 +; VI-NEXT: v_mov_b32_e32 v57, s62 +; VI-NEXT: v_mov_b32_e32 v58, s63 +; VI-NEXT: v_mov_b32_e32 v59, s64 +; VI-NEXT: v_mov_b32_e32 v60, s65 +; VI-NEXT: v_mov_b32_e32 v61, s66 +; VI-NEXT: v_mov_b32_e32 v62, s67 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill 
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB144_2 +; VI-NEXT: ; %bb.1: ; %if +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: 
s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: .LBB144_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword 
v36, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x70, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: v_readlane_b32 s67, v63, 15 +; VI-NEXT: v_readlane_b32 s66, v63, 14 +; VI-NEXT: v_readlane_b32 s65, v63, 13 +; VI-NEXT: v_readlane_b32 s64, v63, 12 +; VI-NEXT: v_readlane_b32 s55, v63, 11 +; VI-NEXT: v_readlane_b32 s54, v63, 10 +; VI-NEXT: v_readlane_b32 s53, v63, 9 +; VI-NEXT: v_readlane_b32 s52, v63, 8 +; VI-NEXT: v_readlane_b32 s51, v63, 7 +; VI-NEXT: v_readlane_b32 s50, v63, 6 +; VI-NEXT: v_readlane_b32 s49, v63, 5 +; VI-NEXT: v_readlane_b32 s48, v63, 4 +; VI-NEXT: v_readlane_b32 s39, v63, 3 +; VI-NEXT: v_readlane_b32 s38, v63, 2 +; VI-NEXT: v_readlane_b32 s37, v63, 1 +; VI-NEXT: v_readlane_b32 s36, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dwordx4 v[3:4], v[33:36] +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x60, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[29:32] +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x50, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[25:28] +; VI-NEXT: v_add_u32_e32 v3, vcc, 64, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[21:24] +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] +; VI-NEXT: flat_store_dwordx4 v[1:2], v[5:8] +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_bitcast_v64bf16_to_v16f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword 
v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v63, s36, 0 +; GFX9-NEXT: v_writelane_b32 v63, s37, 1 +; GFX9-NEXT: v_writelane_b32 v63, s38, 2 +; GFX9-NEXT: v_writelane_b32 v63, s39, 3 +; GFX9-NEXT: v_writelane_b32 v63, s48, 4 +; GFX9-NEXT: v_writelane_b32 v63, s49, 5 +; GFX9-NEXT: v_writelane_b32 v63, s50, 6 +; GFX9-NEXT: v_writelane_b32 v63, s51, 7 +; GFX9-NEXT: v_writelane_b32 v63, s52, 8 +; GFX9-NEXT: v_writelane_b32 v63, s53, 9 +; GFX9-NEXT: v_writelane_b32 v63, s54, 10 +; GFX9-NEXT: v_writelane_b32 v63, s55, 11 +; GFX9-NEXT: v_writelane_b32 v63, s64, 12 +; GFX9-NEXT: v_writelane_b32 v63, s65, 13 +; GFX9-NEXT: v_writelane_b32 v63, s66, 14 +; GFX9-NEXT: v_writelane_b32 v63, s67, 15 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s36, 0 +; GFX9-NEXT: s_mov_b32 s37, s36 +; GFX9-NEXT: s_mov_b32 s38, s36 +; GFX9-NEXT: s_mov_b32 s39, s36 +; GFX9-NEXT: s_mov_b32 s40, s36 +; GFX9-NEXT: s_mov_b32 s41, s36 +; GFX9-NEXT: s_mov_b32 s42, s36 +; 
GFX9-NEXT: s_mov_b32 s43, s36 +; GFX9-NEXT: s_mov_b32 s44, s36 +; GFX9-NEXT: s_mov_b32 s45, s36 +; GFX9-NEXT: s_mov_b32 s46, s36 +; GFX9-NEXT: s_mov_b32 s47, s36 +; GFX9-NEXT: s_mov_b32 s48, s36 +; GFX9-NEXT: s_mov_b32 s49, s36 +; GFX9-NEXT: s_mov_b32 s50, s36 +; GFX9-NEXT: s_mov_b32 s51, s36 +; GFX9-NEXT: s_mov_b32 s52, s36 +; GFX9-NEXT: s_mov_b32 s53, s36 +; GFX9-NEXT: s_mov_b32 s54, s36 +; GFX9-NEXT: s_mov_b32 s55, s36 +; GFX9-NEXT: s_mov_b32 s56, s36 +; GFX9-NEXT: s_mov_b32 s57, s36 +; GFX9-NEXT: s_mov_b32 s58, s36 +; GFX9-NEXT: s_mov_b32 s59, s36 +; GFX9-NEXT: s_mov_b32 s60, s36 +; GFX9-NEXT: s_mov_b32 s61, s36 +; GFX9-NEXT: s_mov_b32 s62, s36 +; GFX9-NEXT: s_mov_b32 s63, s36 +; GFX9-NEXT: s_mov_b32 s64, s36 +; GFX9-NEXT: s_mov_b32 s65, s36 +; GFX9-NEXT: s_mov_b32 s66, s36 +; GFX9-NEXT: s_mov_b32 s67, s36 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 
offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v31, s36 +; GFX9-NEXT: v_mov_b32_e32 v32, s37 +; GFX9-NEXT: v_mov_b32_e32 v33, s38 +; GFX9-NEXT: v_mov_b32_e32 v34, s39 +; GFX9-NEXT: v_mov_b32_e32 v35, s40 +; GFX9-NEXT: v_mov_b32_e32 v36, s41 +; GFX9-NEXT: v_mov_b32_e32 v37, s42 
+; GFX9-NEXT: v_mov_b32_e32 v38, s43 +; GFX9-NEXT: v_mov_b32_e32 v39, s44 +; GFX9-NEXT: v_mov_b32_e32 v40, s45 +; GFX9-NEXT: v_mov_b32_e32 v41, s46 +; GFX9-NEXT: v_mov_b32_e32 v42, s47 +; GFX9-NEXT: v_mov_b32_e32 v43, s48 +; GFX9-NEXT: v_mov_b32_e32 v44, s49 +; GFX9-NEXT: v_mov_b32_e32 v45, s50 +; GFX9-NEXT: v_mov_b32_e32 v46, s51 +; GFX9-NEXT: v_mov_b32_e32 v47, s52 +; GFX9-NEXT: v_mov_b32_e32 v48, s53 +; GFX9-NEXT: v_mov_b32_e32 v49, s54 +; GFX9-NEXT: v_mov_b32_e32 v50, s55 +; GFX9-NEXT: v_mov_b32_e32 v51, s56 +; GFX9-NEXT: v_mov_b32_e32 v52, s57 +; GFX9-NEXT: v_mov_b32_e32 v53, s58 +; GFX9-NEXT: v_mov_b32_e32 v54, s59 +; GFX9-NEXT: v_mov_b32_e32 v55, s60 +; GFX9-NEXT: v_mov_b32_e32 v56, s61 +; GFX9-NEXT: v_mov_b32_e32 v57, s62 +; GFX9-NEXT: v_mov_b32_e32 v58, s63 +; GFX9-NEXT: v_mov_b32_e32 v59, s64 +; GFX9-NEXT: v_mov_b32_e32 v60, s65 +; GFX9-NEXT: v_mov_b32_e32 v61, s66 +; GFX9-NEXT: v_mov_b32_e32 v62, s67 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB144_2 +; GFX9-NEXT: ; %bb.1: ; %if +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, 
s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 
4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt 
vmcnt(31) +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB144_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s67, v63, 15 +; GFX9-NEXT: v_readlane_b32 s66, v63, 14 +; GFX9-NEXT: v_readlane_b32 s65, v63, 13 +; GFX9-NEXT: v_readlane_b32 s64, v63, 12 +; GFX9-NEXT: v_readlane_b32 s55, v63, 11 +; GFX9-NEXT: v_readlane_b32 s54, v63, 10 +; GFX9-NEXT: v_readlane_b32 s53, v63, 9 +; GFX9-NEXT: v_readlane_b32 s52, v63, 8 +; GFX9-NEXT: v_readlane_b32 s51, v63, 7 +; GFX9-NEXT: v_readlane_b32 s50, v63, 6 +; GFX9-NEXT: v_readlane_b32 s49, v63, 5 +; GFX9-NEXT: v_readlane_b32 s48, v63, 4 +; GFX9-NEXT: v_readlane_b32 s39, 
v63, 3 +; GFX9-NEXT: v_readlane_b32 s38, v63, 2 +; GFX9-NEXT: v_readlane_b32 s37, v63, 1 +; GFX9-NEXT: v_readlane_b32 s36, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:96 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:80 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off offset:64 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; 
GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_bitcast_v64bf16_to_v16f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v67, s32 offset:80 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 +; GFX11-NEXT: ; meta instruction +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 +; GFX11-NEXT: 
v_writelane_b32 v67, s30, 0 +; GFX11-NEXT: v_writelane_b32 v67, s31, 1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s31, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s16, s0 +; GFX11-NEXT: s_mov_b32 s17, s0 +; GFX11-NEXT: s_mov_b32 s18, s0 +; GFX11-NEXT: s_mov_b32 s19, s0 +; GFX11-NEXT: s_mov_b32 s20, s0 +; GFX11-NEXT: s_mov_b32 s21, s0 +; GFX11-NEXT: s_mov_b32 s22, s0 +; GFX11-NEXT: s_mov_b32 s23, s0 +; GFX11-NEXT: s_mov_b32 s24, s0 +; GFX11-NEXT: s_mov_b32 s25, s0 +; GFX11-NEXT: s_mov_b32 s26, s0 +; GFX11-NEXT: s_mov_b32 s27, s0 +; GFX11-NEXT: s_mov_b32 s28, s0 +; GFX11-NEXT: s_mov_b32 s29, s0 +; GFX11-NEXT: s_mov_b32 s30, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v66, s31 :: v_dual_mov_b32 v65, s30 +; GFX11-NEXT: v_dual_mov_b32 v64, s29 :: v_dual_mov_b32 v63, s28 +; GFX11-NEXT: v_dual_mov_b32 v62, s27 :: v_dual_mov_b32 v61, s26 +; GFX11-NEXT: v_dual_mov_b32 v60, s25 :: v_dual_mov_b32 v59, s24 +; GFX11-NEXT: v_dual_mov_b32 v58, s23 :: v_dual_mov_b32 v57, s22 +; GFX11-NEXT: v_dual_mov_b32 v56, s21 :: v_dual_mov_b32 v55, s20 +; GFX11-NEXT: v_dual_mov_b32 v54, s19 :: v_dual_mov_b32 v53, s18 +; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v51, s16 +; GFX11-NEXT: 
v_dual_mov_b32 v50, s15 :: v_dual_mov_b32 v49, s14 +; GFX11-NEXT: v_dual_mov_b32 v48, s13 :: v_dual_mov_b32 v47, s12 +; GFX11-NEXT: v_dual_mov_b32 v46, s11 :: v_dual_mov_b32 v45, s10 +; GFX11-NEXT: v_dual_mov_b32 v44, s9 :: v_dual_mov_b32 v43, s8 +; GFX11-NEXT: v_dual_mov_b32 v42, s7 :: v_dual_mov_b32 v41, s6 +; GFX11-NEXT: v_dual_mov_b32 v40, s5 :: v_dual_mov_b32 v39, s4 +; GFX11-NEXT: v_dual_mov_b32 v38, s3 :: v_dual_mov_b32 v37, s2 +; GFX11-NEXT: v_dual_mov_b32 v36, s1 :: v_dual_mov_b32 v35, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB144_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33 +; GFX11-NEXT: v_dual_mov_b32 v64, v32 :: v_dual_mov_b32 v63, v31 +; GFX11-NEXT: v_dual_mov_b32 v62, v30 :: v_dual_mov_b32 v61, v29 +; GFX11-NEXT: v_dual_mov_b32 v60, v28 :: v_dual_mov_b32 v59, v27 +; GFX11-NEXT: v_dual_mov_b32 v58, v26 :: v_dual_mov_b32 v57, v25 +; GFX11-NEXT: v_dual_mov_b32 v56, v24 :: v_dual_mov_b32 v55, v23 +; GFX11-NEXT: v_dual_mov_b32 v54, v22 :: v_dual_mov_b32 v53, v21 +; GFX11-NEXT: v_dual_mov_b32 v52, v20 :: v_dual_mov_b32 v51, v19 +; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v49, v17 +; GFX11-NEXT: v_dual_mov_b32 v48, v16 :: v_dual_mov_b32 v47, v15 +; GFX11-NEXT: v_dual_mov_b32 v46, v14 :: v_dual_mov_b32 v45, v13 +; GFX11-NEXT: v_dual_mov_b32 v44, v12 :: v_dual_mov_b32 v43, v11 +; GFX11-NEXT: v_dual_mov_b32 v42, v10 :: v_dual_mov_b32 v41, v9 +; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7 +; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5 +; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3 +; GFX11-NEXT: .LBB144_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112 +; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:96 +; GFX11-NEXT: 
global_store_b128 v[1:2], v[55:58], off offset:80 +; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:64 +; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:48 +; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:32 +; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:16 +; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 +; GFX11-NEXT: v_readlane_b32 s30, v67, 0 +; GFX11-NEXT: v_readlane_b32 s31, v67, 1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:80 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll index a3f27ebddf9d6..c9a4379a6dfcd 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll @@ -19,8 +19,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() 
nounwind readnone ; GCN-ALLOCA: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, v{{[0-9]+}}, v0 ; GCN-PROMOTE: s_cmp_eq_u32 [[IN]], 1 -; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-PROMOTE-NEXT: v_addc_u32_e32 [[RESULT:v[0-9]+]], vcc, 0, v0, vcc +; GCN-PROMOTE-NEXT: s_cselect_b32 [[SCC:s[0-9]+]], 1, 0 +; GCN-PROMOTE-NEXT: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, [[SCC]], v0 ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @work_item_info(ptr addrspace(1) %out, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll index c96ef12936573..43c21e4b79e35 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll @@ -240,7 +240,7 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) { } -attributes #0 = { "amdgpu-no-agpr" } +attributes #0 = { "amdgpu-agpr-alloc"="0" } ;. ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"target-cpu"="gfx90a" "uniform-work-group-size"="false" } @@ -248,5 +248,5 @@ attributes #0 = { "amdgpu-no-agpr" } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } ; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" } ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-agpr" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll new file mode 100644 index 0000000000000..bfc35d8c76e37 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll @@ -0,0 +1,28 @@ +; REQUIRES: amdgpu-registered-target + +; RUN: not opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' < %s 2>&1 | FileCheck %s + +; CHECK: error:{{.*}}in function kernel void (ptr addrspace(1), i32, ptr addrspace(1)): Impossible to constant fold feature predicate: @llvm.amdgcn.is.gfx803 = private addrspace(1) constant i1 false used by %call = call i1 %1(i1 zeroext false), please simplify. 
+ +@llvm.amdgcn.is.gfx803 = external addrspace(1) externally_initialized constant i1 + +declare void @llvm.amdgcn.s.sleep(i32 immarg) #1 + +define amdgpu_kernel void @kernel(ptr addrspace(1) readnone captures(none) %p.coerce, i32 %x, ptr addrspace(1) %pfn.coerce) { +entry: + %0 = ptrtoint ptr addrspace(1) %pfn.coerce to i64 + %1 = inttoptr i64 %0 to ptr + %2 = ptrtoint ptr addrspace(1) %pfn.coerce to i64 + %3 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx803, align 1 + %call = call i1 %1(i1 zeroext %3) + br i1 %call, label %if.gfx803, label %if.end + +if.gfx803: + call void @llvm.amdgcn.s.sleep(i32 0) + br label %if.end + +if.end: + ret void +} + +attributes #1 = { nocallback nofree nosync nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll new file mode 100644 index 0000000000000..a16a7fc31da22 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll @@ -0,0 +1,284 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; REQUIRES: amdgpu-registered-target + +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1010 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1101 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201-W64 %s + +;; The IR was derived from the following source: +;; extern "C" __global__ void kernel(int* p, 
int x) +;; { +;; if (__builtin_amdgcn_processor_is("gfx1201") || +;; __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) +;; __builtin_amdgcn_s_sleep_var(x); +;; if (!__builtin_amdgcn_processor_is("gfx906")) +;; __builtin_amdgcn_s_wait_event_export_ready(); +;; else if (__builtin_amdgcn_processor_is("gfx1010") || +;; __builtin_amdgcn_processor_is("gfx1101")) +;; __builtin_amdgcn_s_ttracedata_imm(1); +;; while (__builtin_amdgcn_processor_is("gfx1101")) *p += x; +;; do { +;; *p -= x; +;; } while (__builtin_amdgcn_processor_is("gfx1010")); +;; for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break; +;; +;; if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)) +;; __builtin_amdgcn_s_wait_event_export_ready(); +;; else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm)) +;; __builtin_amdgcn_s_ttracedata_imm(1); +;; +;; do { +;; *p -= x; +;; } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); +;; for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break; +;; } + +@llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx12-insts = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1010 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1101 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx11-insts = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1 +@"llvm.amdgcn.has.gfx12-insts,wavefrontsize64" = external addrspace(1) externally_initialized constant i1 + +declare void @llvm.amdgcn.s.sleep.var(i32) +declare void @llvm.amdgcn.s.wait.event.export.ready() +declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg) + +define amdgpu_kernel void @kernel(ptr addrspace(1) 
%p.coerce, i32 %x) { +; GFX906-LABEL: define amdgpu_kernel void @kernel( +; GFX906-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX906-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX906-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX906: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX906-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX906: [[IF_NOT_GFX907]]: +; GFX906-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX906: [[IF_GFX1010_OR_GFX1101]]: +; GFX906-NEXT: br label %[[LOR_NOT_GFX1010:.*]] +; GFX906: [[LOR_NOT_GFX1010]]: +; GFX906-NEXT: br label %[[FOR_COND:.*]] +; GFX906: [[FOR_COND]]: +; GFX906-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX906-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX906-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX906-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX906: [[IF_GFX11_INSTS]]: +; GFX906-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX906: [[IF_GFX10_INSTS]]: +; GFX906-NEXT: call void @llvm.assume(i1 true) +; GFX906-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX906-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX906-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX906-NEXT: ret void +; +; GFX1010-LABEL: define amdgpu_kernel void @kernel( +; GFX1010-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1010-NEXT: [[ENTRY:.*:]] +; GFX1010-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1010-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1010-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX1010: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1010-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1010: [[IF_NOT_GFX906]]: +; GFX1010-NEXT: br label %[[LOR_NOT_GFX1010:.*]] +; GFX1010: [[LOR_NOT_GFX1010]]: +; GFX1010-NEXT: call void 
@llvm.amdgcn.s.wait.event.export.ready() +; GFX1010-NEXT: br label %[[IF_END6:.*]] +; GFX1010: [[IF_END6]]: +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: br label %[[FOR_COND:.*]] +; GFX1010: [[FOR_COND]]: +; GFX1010-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1010-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1010-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1010-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1010: [[IF_GFX11_INSTS]]: +; GFX1010-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX1010: [[IF_GFX10_INSTS]]: +; GFX1010-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1010-NEXT: br label %[[IF_END11:.*]] +; GFX1010: [[IF_END11]]: +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1010-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1010-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1010-NEXT: ret void +; +; GFX1101-LABEL: define amdgpu_kernel void @kernel( +; GFX1101-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1101-NEXT: [[ENTRY:.*:]] +; GFX1101-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1101-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1101-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX1101: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1101-NEXT: br label %[[IF_END:.*]] +; GFX1101: [[IF_END]]: +; GFX1101-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX1101: [[IF_NOT_GFX907]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1101-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1101: [[IF_NOT_GFX906]]: +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: br label %[[FOR_COND:.*]] +; GFX1101: [[FOR_COND]]: +; GFX1101-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], 
align 4 +; GFX1101-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1101-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1101-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1101: [[IF_GFX11_INSTS]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1101-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1101: [[IF_ELSE8]]: +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1101-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1101-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1101-NEXT: ret void +; +; GFX1201-LABEL: define amdgpu_kernel void @kernel( +; GFX1201-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1201-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1201-NEXT: br label %[[LOR_NOT_GFX1201:.*]] +; GFX1201: [[LOR_NOT_GFX1201]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1201-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1201: [[IF_NOT_GFX906]]: +; GFX1201-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX1201: [[IF_GFX1010_OR_GFX1101]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-NEXT: br label %[[IF_END6:.*]] +; GFX1201: [[IF_END6]]: +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: br label %[[FOR_COND:.*]] +; GFX1201: [[FOR_COND]]: +; GFX1201-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1201-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1201: [[IF_GFX11_INSTS]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1201: 
[[IF_ELSE8]]: +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1201-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-NEXT: ret void +; +; GFX1201-W64-LABEL: define amdgpu_kernel void @kernel( +; GFX1201-W64-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1201-W64-NEXT: [[ENTRY:.*:]] +; GFX1201-W64-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1201-W64-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1201-W64-NEXT: br label %[[LOR_NOT_GFX1201:.*]] +; GFX1201-W64: [[LOR_NOT_GFX1201]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1201-W64-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1201-W64: [[IF_NOT_GFX906]]: +; GFX1201-W64-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX1201-W64: [[IF_GFX1010_OR_GFX1101]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-W64-NEXT: br label %[[IF_END6:.*]] +; GFX1201-W64: [[IF_END6]]: +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: br label %[[FOR_COND:.*]] +; GFX1201-W64: [[FOR_COND]]: +; GFX1201-W64-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1201-W64-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1201-W64: [[IF_GFX11_INSTS]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-W64-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1201-W64: [[IF_ELSE8]]: +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1201-W64-NEXT: store 
i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: ret void +; +entry: + %0 = ptrtoint ptr addrspace(1) %p.coerce to i64 + %1 = inttoptr i64 %0 to ptr + %2 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1 + br i1 %2, label %if.gfx1201.or.gfx12-insts, label %lor.not.gfx1201 + +lor.not.gfx1201: + %3 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx12-insts, align 1 + br i1 %3, label %if.gfx1201.or.gfx12-insts, label %if.end + +if.gfx1201.or.gfx12-insts: + call void @llvm.amdgcn.s.sleep.var(i32 %x) + br label %if.end + +if.end: + %4 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1 + br i1 %4, label %if.gfx906, label %if.not.gfx906 + +if.not.gfx906: + call void @llvm.amdgcn.s.wait.event.export.ready() + br label %if.end6 + +if.gfx906: + %5 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1 + br i1 %5, label %if.gfx1010.or.gfx1101, label %lor.not.gfx1010 + +lor.not.gfx1010: + %6 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1 + br i1 %6, label %if.gfx1010.or.gfx1101, label %for.cond + +if.gfx1010.or.gfx1101: + call void @llvm.amdgcn.s.ttracedata.imm(i16 1) + br label %if.end6 + +if.end6: + %.pr.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1 + %7 = icmp ne i1 %.pr.pr, true + call void @llvm.assume(i1 %7) + %.pr6.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1 + %8 = icmp ne i1 %.pr6.pr, true + call void @llvm.assume(i1 %8) + br label %for.cond + +for.cond: + %.promoted = load i32, ptr %1, align 4 + %sub.peel = sub nsw i32 %.promoted, %x + store i32 %sub.peel, ptr %1, align 4 + %9 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx11-insts, align 1 + br i1 %9, label %if.gfx11-insts, label %if.else8 + +if.gfx11-insts: + call void @llvm.amdgcn.s.wait.event.export.ready() + br label %if.end11 + +if.else8: + %10 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1 + br i1 %10, label %if.gfx10-insts, label %if.end11 + +if.gfx10-insts: + call void 
@llvm.amdgcn.s.ttracedata.imm(i16 1) + br label %if.end11 + +if.end11: + %.pr7 = load i1, ptr addrspace(1) @"llvm.amdgcn.has.gfx12-insts,wavefrontsize64", align 1 + %11 = icmp ne i1 %.pr7, true + call void @llvm.assume(i1 %11) + %.promoted9 = load i32, ptr %1, align 4 + %sub13.peel = sub nsw i32 %.promoted9, %x + store i32 %sub13.peel, ptr %1, align 4 + ret void +} + +declare void @llvm.assume(i1 noundef) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare-crash-splat.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare-crash-splat.ll new file mode 100644 index 0000000000000..b8464c37a5dc2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare-crash-splat.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-late-codegenprepare %s | FileCheck %s + +; This crashed because the PHI with a splat was rejected, but then we marked the PHI +; as visited and tried to convert one of its user afterwards. 
+ +define amdgpu_kernel void @widget(ptr %arg, ptr %arg1, ptr %arg2) { +; CHECK-LABEL: define amdgpu_kernel void @widget( +; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i1 +; CHECK-NEXT: [[ARG1_LOAD:%.*]] = load <4 x i8>, ptr [[ARG1]], align 4 +; CHECK-NEXT: [[ARG2_LOAD:%.*]] = load i64, ptr [[ARG2]], align 4 +; CHECK-NEXT: br label %[[BB_1:.*]] +; CHECK: [[BB_1]]: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ null, %[[BB]] ], [ [[ARG1]], %[[BB_6:.*]] ] +; CHECK-NEXT: [[PHI4:%.*]] = phi <4 x i8> [ splat (i8 1), %[[BB]] ], [ [[PHI15:%.*]], %[[BB_6]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_2:.*]], label %[[BB_6]] +; CHECK: [[BB_2]]: +; CHECK-NEXT: [[PHI7:%.*]] = phi <4 x i8> [ [[PHI13:%.*]], %[[BB_5:.*]] ], [ [[PHI4]], %[[BB_1]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_4:.*]], label %[[BB_5]] +; CHECK: [[BB_3:.*]]: +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_4]], label %[[BB_EXIT:.*]] +; CHECK: [[BB_4]]: +; CHECK-NEXT: [[PHI11:%.*]] = phi <4 x i8> [ [[PHI7]], %[[BB_3]] ], [ zeroinitializer, %[[BB_2]] ] +; CHECK-NEXT: store <4 x i8> [[PHI11]], ptr [[PHI]], align 1 +; CHECK-NEXT: br label %[[BB_5]] +; CHECK: [[BB_5]]: +; CHECK-NEXT: [[PHI13]] = phi <4 x i8> [ zeroinitializer, %[[BB_4]] ], [ [[PHI7]], %[[BB_2]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_2]], label %[[BB_6]] +; CHECK: [[BB_6]]: +; CHECK-NEXT: [[PHI15]] = phi <4 x i8> [ [[ARG1_LOAD]], %[[BB_1]] ], [ zeroinitializer, %[[BB_5]] ] +; CHECK-NEXT: br label %[[BB_1]] +; CHECK: [[BB_EXIT]]: +; CHECK-NEXT: ret void +; +bb: + %ld = load i32, ptr %arg, align 4 + %ld.trunc = trunc i32 %ld to i1 + %arg1.load = load <4 x i8>, ptr %arg1, align 4 + %arg2.load = load i64, ptr %arg2, align 4 + br label %bb.1 + +bb.1: + %phi = phi ptr [ null, %bb ], [ %arg1, %bb.6 ] + %phi4 = phi <4 x i8> [ splat (i8 1), %bb ], [ %phi15, %bb.6 ] + br i1 %ld.trunc, label %bb.2, 
label %bb.6 + +bb.2: + %phi7 = phi <4 x i8> [ %phi13, %bb.5 ], [ %phi4, %bb.1 ] + br i1 %ld.trunc, label %bb.4, label %bb.5 + +bb.3: + br i1 %ld.trunc, label %bb.4, label %bb.exit + +bb.4: + %phi11 = phi <4 x i8> [ %phi7, %bb.3 ], [ zeroinitializer, %bb.2 ] + store <4 x i8> %phi11, ptr %phi, align 1 + br label %bb.5 + +bb.5: + %phi13 = phi <4 x i8> [ zeroinitializer, %bb.4 ], [ %phi7, %bb.2 ] + br i1 %ld.trunc, label %bb.2, label %bb.6 + +bb.6: + %phi15 = phi <4 x i8> [ %arg1.load, %bb.1 ], [ zeroinitializer, %bb.5 ] + br label %bb.1 + +bb.exit: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 966466181ef47..1627bb7a7671c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -82,17 +82,16 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) { ; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 ; CHECK-NEXT: s_mov_b32 s4, 0x800000 ; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc ; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1 -; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3 ; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3 ; CHECK-NEXT: v_log_f32_e32 v3, v3 -; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 +; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2 -; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4 ; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 +; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4 ; CHECK-NEXT: v_mov_b32_e32 v5, 0x42800000 ; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc @@ -129,28 +128,28 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: v_writelane_b32 v43, s37, 3 ; CHECK-NEXT: v_writelane_b32 v43, s38, 4 ; CHECK-NEXT: 
v_writelane_b32 v43, s39, 5 -; CHECK-NEXT: v_writelane_b32 v43, s40, 6 -; CHECK-NEXT: v_writelane_b32 v43, s41, 7 -; CHECK-NEXT: v_writelane_b32 v43, s42, 8 -; CHECK-NEXT: v_writelane_b32 v43, s43, 9 -; CHECK-NEXT: v_writelane_b32 v43, s44, 10 -; CHECK-NEXT: v_writelane_b32 v43, s45, 11 +; CHECK-NEXT: v_writelane_b32 v43, s48, 6 +; CHECK-NEXT: v_writelane_b32 v43, s49, 7 +; CHECK-NEXT: v_writelane_b32 v43, s50, 8 +; CHECK-NEXT: v_writelane_b32 v43, s51, 9 +; CHECK-NEXT: v_writelane_b32 v43, s52, 10 +; CHECK-NEXT: v_writelane_b32 v43, s53, 11 ; CHECK-NEXT: v_writelane_b32 v43, s30, 12 ; CHECK-NEXT: v_writelane_b32 v43, s31, 13 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v42, v1 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: v_mov_b32_e32 v41, v2 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] @@ -161,15 +160,15 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: 
s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -181,12 +180,12 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: v_readlane_b32 s30, v43, 12 ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_readlane_b32 s31, v43, 13 -; CHECK-NEXT: v_readlane_b32 s45, v43, 11 -; CHECK-NEXT: v_readlane_b32 s44, v43, 10 -; CHECK-NEXT: v_readlane_b32 s43, v43, 9 -; CHECK-NEXT: v_readlane_b32 s42, v43, 8 -; CHECK-NEXT: v_readlane_b32 s41, v43, 7 -; CHECK-NEXT: v_readlane_b32 s40, v43, 6 +; CHECK-NEXT: v_readlane_b32 s53, v43, 11 +; CHECK-NEXT: v_readlane_b32 s52, v43, 10 +; CHECK-NEXT: v_readlane_b32 s51, v43, 9 +; CHECK-NEXT: v_readlane_b32 s50, v43, 8 +; CHECK-NEXT: v_readlane_b32 s49, v43, 7 +; CHECK-NEXT: v_readlane_b32 s48, v43, 6 ; CHECK-NEXT: v_readlane_b32 s39, v43, 5 ; CHECK-NEXT: v_readlane_b32 s38, v43, 4 ; CHECK-NEXT: v_readlane_b32 s37, v43, 3 @@ -228,8 +227,7 @@ define float @test_powr_fast_f32(float %x, float %y) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0x800000 ; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc ; CHECK-NEXT: v_ldexp_f32 v0, v0, v3 ; CHECK-NEXT: v_log_f32_e32 v0, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -270,27 +268,27 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: v_writelane_b32 v43, s37, 3 ; CHECK-NEXT: v_writelane_b32 v43, s38, 4 ; CHECK-NEXT: v_writelane_b32 v43, s39, 5 -; CHECK-NEXT: 
v_writelane_b32 v43, s40, 6 -; CHECK-NEXT: v_writelane_b32 v43, s41, 7 -; CHECK-NEXT: v_writelane_b32 v43, s42, 8 -; CHECK-NEXT: v_writelane_b32 v43, s43, 9 -; CHECK-NEXT: v_writelane_b32 v43, s44, 10 -; CHECK-NEXT: v_writelane_b32 v43, s45, 11 +; CHECK-NEXT: v_writelane_b32 v43, s48, 6 +; CHECK-NEXT: v_writelane_b32 v43, s49, 7 +; CHECK-NEXT: v_writelane_b32 v43, s50, 8 +; CHECK-NEXT: v_writelane_b32 v43, s51, 9 +; CHECK-NEXT: v_writelane_b32 v43, s52, 10 +; CHECK-NEXT: v_writelane_b32 v43, s53, 11 ; CHECK-NEXT: v_writelane_b32 v43, s30, 12 ; CHECK-NEXT: v_writelane_b32 v43, s31, 13 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mov_b32_e32 v42, v31 ; CHECK-NEXT: v_mov_b32_e32 v41, v3 ; CHECK-NEXT: v_mov_b32_e32 v40, v2 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] @@ -301,14 +299,14 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; 
CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 ; CHECK-NEXT: v_mov_b32_e32 v31, v42 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -317,12 +315,12 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: v_readlane_b32 s30, v43, 12 ; CHECK-NEXT: v_readlane_b32 s31, v43, 13 -; CHECK-NEXT: v_readlane_b32 s45, v43, 11 -; CHECK-NEXT: v_readlane_b32 s44, v43, 10 -; CHECK-NEXT: v_readlane_b32 s43, v43, 9 -; CHECK-NEXT: v_readlane_b32 s42, v43, 8 -; CHECK-NEXT: v_readlane_b32 s41, v43, 7 -; CHECK-NEXT: v_readlane_b32 s40, v43, 6 +; CHECK-NEXT: v_readlane_b32 s53, v43, 11 +; CHECK-NEXT: v_readlane_b32 s52, v43, 10 +; CHECK-NEXT: v_readlane_b32 s51, v43, 9 +; CHECK-NEXT: v_readlane_b32 s50, v43, 8 +; CHECK-NEXT: v_readlane_b32 s49, v43, 7 +; CHECK-NEXT: v_readlane_b32 s48, v43, 6 ; CHECK-NEXT: v_readlane_b32 s39, v43, 5 ; CHECK-NEXT: v_readlane_b32 s38, v43, 4 ; CHECK-NEXT: v_readlane_b32 s37, v43, 3 @@ -368,8 +366,7 @@ define float @test_pown_fast_f32(float %x, i32 %y) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0x800000 ; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc ; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3 ; CHECK-NEXT: v_log_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1 @@ -413,28 +410,28 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: v_writelane_b32 v43, s37, 3 ; CHECK-NEXT: v_writelane_b32 v43, s38, 4 ; CHECK-NEXT: v_writelane_b32 v43, s39, 5 -; CHECK-NEXT: v_writelane_b32 v43, s40, 6 -; CHECK-NEXT: v_writelane_b32 v43, s41, 7 -; CHECK-NEXT: v_writelane_b32 v43, s42, 8 -; 
CHECK-NEXT: v_writelane_b32 v43, s43, 9 -; CHECK-NEXT: v_writelane_b32 v43, s44, 10 -; CHECK-NEXT: v_writelane_b32 v43, s45, 11 +; CHECK-NEXT: v_writelane_b32 v43, s48, 6 +; CHECK-NEXT: v_writelane_b32 v43, s49, 7 +; CHECK-NEXT: v_writelane_b32 v43, s50, 8 +; CHECK-NEXT: v_writelane_b32 v43, s51, 9 +; CHECK-NEXT: v_writelane_b32 v43, s52, 10 +; CHECK-NEXT: v_writelane_b32 v43, s53, 11 ; CHECK-NEXT: v_writelane_b32 v43, s30, 12 ; CHECK-NEXT: v_writelane_b32 v43, s31, 13 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v42, v1 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: v_mov_b32_e32 v41, v2 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] @@ -445,15 +442,15 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 
-; CHECK-NEXT: s_mov_b32 s15, s42 +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -465,12 +462,12 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: v_readlane_b32 s30, v43, 12 ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_readlane_b32 s31, v43, 13 -; CHECK-NEXT: v_readlane_b32 s45, v43, 11 -; CHECK-NEXT: v_readlane_b32 s44, v43, 10 -; CHECK-NEXT: v_readlane_b32 s43, v43, 9 -; CHECK-NEXT: v_readlane_b32 s42, v43, 8 -; CHECK-NEXT: v_readlane_b32 s41, v43, 7 -; CHECK-NEXT: v_readlane_b32 s40, v43, 6 +; CHECK-NEXT: v_readlane_b32 s53, v43, 11 +; CHECK-NEXT: v_readlane_b32 s52, v43, 10 +; CHECK-NEXT: v_readlane_b32 s51, v43, 9 +; CHECK-NEXT: v_readlane_b32 s50, v43, 8 +; CHECK-NEXT: v_readlane_b32 s49, v43, 7 +; CHECK-NEXT: v_readlane_b32 s48, v43, 6 ; CHECK-NEXT: v_readlane_b32 s39, v43, 5 ; CHECK-NEXT: v_readlane_b32 s38, v43, 4 ; CHECK-NEXT: v_readlane_b32 s37, v43, 3 @@ -511,8 +508,7 @@ define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0x800000 ; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc ; CHECK-NEXT: v_ldexp_f32 v0, |v0|, v3 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; CHECK-NEXT: v_log_f32_e32 v0, v0 @@ -555,26 +551,26 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: v_writelane_b32 v42, s37, 3 ; CHECK-NEXT: v_writelane_b32 v42, s38, 4 ; CHECK-NEXT: v_writelane_b32 v42, s39, 5 -; CHECK-NEXT: v_writelane_b32 v42, s40, 6 -; CHECK-NEXT: v_writelane_b32 v42, s41, 7 -; CHECK-NEXT: v_writelane_b32 v42, s42, 8 -; CHECK-NEXT: v_writelane_b32 v42, s43, 9 -; 
CHECK-NEXT: v_writelane_b32 v42, s44, 10 -; CHECK-NEXT: v_writelane_b32 v42, s45, 11 +; CHECK-NEXT: v_writelane_b32 v42, s48, 6 +; CHECK-NEXT: v_writelane_b32 v42, s49, 7 +; CHECK-NEXT: v_writelane_b32 v42, s50, 8 +; CHECK-NEXT: v_writelane_b32 v42, s51, 9 +; CHECK-NEXT: v_writelane_b32 v42, s52, 10 +; CHECK-NEXT: v_writelane_b32 v42, s53, 11 ; CHECK-NEXT: v_writelane_b32 v42, s30, 12 ; CHECK-NEXT: v_writelane_b32 v42, s31, 13 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] @@ -586,15 +582,15 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 
+; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -602,12 +598,12 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: v_readlane_b32 s30, v42, 12 ; CHECK-NEXT: v_readlane_b32 s31, v42, 13 -; CHECK-NEXT: v_readlane_b32 s45, v42, 11 -; CHECK-NEXT: v_readlane_b32 s44, v42, 10 -; CHECK-NEXT: v_readlane_b32 s43, v42, 9 -; CHECK-NEXT: v_readlane_b32 s42, v42, 8 -; CHECK-NEXT: v_readlane_b32 s41, v42, 7 -; CHECK-NEXT: v_readlane_b32 s40, v42, 6 +; CHECK-NEXT: v_readlane_b32 s53, v42, 11 +; CHECK-NEXT: v_readlane_b32 s52, v42, 10 +; CHECK-NEXT: v_readlane_b32 s51, v42, 9 +; CHECK-NEXT: v_readlane_b32 s50, v42, 8 +; CHECK-NEXT: v_readlane_b32 s49, v42, 7 +; CHECK-NEXT: v_readlane_b32 s48, v42, 6 ; CHECK-NEXT: v_readlane_b32 s39, v42, 5 ; CHECK-NEXT: v_readlane_b32 s38, v42, 4 ; CHECK-NEXT: v_readlane_b32 s37, v42, 3 @@ -651,8 +647,7 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0x800000 ; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc ; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3 ; CHECK-NEXT: v_or_b32_e32 v1, 1, v1 ; CHECK-NEXT: v_log_f32_e32 v3, v3 @@ -698,27 +693,27 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: v_writelane_b32 v43, s37, 3 ; CHECK-NEXT: v_writelane_b32 v43, s38, 4 ; CHECK-NEXT: v_writelane_b32 v43, s39, 5 -; CHECK-NEXT: v_writelane_b32 v43, s40, 6 -; CHECK-NEXT: v_writelane_b32 v43, s41, 7 -; CHECK-NEXT: v_writelane_b32 v43, s42, 8 -; CHECK-NEXT: v_writelane_b32 v43, s43, 9 -; CHECK-NEXT: v_writelane_b32 v43, s44, 10 -; CHECK-NEXT: 
v_writelane_b32 v43, s45, 11 +; CHECK-NEXT: v_writelane_b32 v43, s48, 6 +; CHECK-NEXT: v_writelane_b32 v43, s49, 7 +; CHECK-NEXT: v_writelane_b32 v43, s50, 8 +; CHECK-NEXT: v_writelane_b32 v43, s51, 9 +; CHECK-NEXT: v_writelane_b32 v43, s52, 10 +; CHECK-NEXT: v_writelane_b32 v43, s53, 11 ; CHECK-NEXT: v_writelane_b32 v43, s30, 12 ; CHECK-NEXT: v_writelane_b32 v43, s31, 13 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v41, v1 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v41 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] @@ -730,15 +725,15 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: 
s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -749,12 +744,12 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: v_readlane_b32 s30, v43, 12 ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_readlane_b32 s31, v43, 13 -; CHECK-NEXT: v_readlane_b32 s45, v43, 11 -; CHECK-NEXT: v_readlane_b32 s44, v43, 10 -; CHECK-NEXT: v_readlane_b32 s43, v43, 9 -; CHECK-NEXT: v_readlane_b32 s42, v43, 8 -; CHECK-NEXT: v_readlane_b32 s41, v43, 7 -; CHECK-NEXT: v_readlane_b32 s40, v43, 6 +; CHECK-NEXT: v_readlane_b32 s53, v43, 11 +; CHECK-NEXT: v_readlane_b32 s52, v43, 10 +; CHECK-NEXT: v_readlane_b32 s51, v43, 9 +; CHECK-NEXT: v_readlane_b32 s50, v43, 8 +; CHECK-NEXT: v_readlane_b32 s49, v43, 7 +; CHECK-NEXT: v_readlane_b32 s48, v43, 6 ; CHECK-NEXT: v_readlane_b32 s39, v43, 5 ; CHECK-NEXT: v_readlane_b32 s38, v43, 4 ; CHECK-NEXT: v_readlane_b32 s37, v43, 3 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll index 0d4f29e0c4f92..c804c75ae7d2c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll @@ -1,56 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-spill-cfi-saved-regs -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,WAVE64 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-spill-cfi-saved-regs -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,WAVE32 %s -; CHECK-LABEL: kern: -; CHECK: .cfi_startproc -; CHECK-NOT: .cfi_{{.*}} -; CHECK: %bb.0: -; CHECK-NEXT: .cfi_escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 -; CHECK-NEXT: .cfi_undefined 16 -; CHECK-NOT: .cfi_{{.*}} -; 
CHECK: .cfi_endproc define protected amdgpu_kernel void @kern() #0 { +; CHECK-LABEL: kern: +; CHECK: .Lfunc_begin0: +; CHECK-NEXT: .cfi_sections .debug_frame +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: .cfi_escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 ; +; CHECK-NEXT: .cfi_undefined 16 +; CHECK-NEXT: s_endpgm entry: ret void } -; CHECK-LABEL: func_saved_in_clobbered_vgpr: -; CHECK: .cfi_startproc -; CHECK-NOT: .cfi_{{.*}} -; CHECK: %bb.0: -; SGPR32 = 64 -; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 -; CHECK-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 - -; WAVE64: v_writelane_b32 v0, exec_lo, 0 -; WAVE64-NEXT: v_writelane_b32 v0, exec_hi, 1 -; WAVE64-NEXT: .cfi_llvm_vector_registers 17, {{[0-9]+}}, {{[0-9]+}}, 32, {{[0-9]+}}, {{[0-9]+}}, 32 - -; WAVE32: v_writelane_b32 v0, exec_lo, 0 -; WAVE32-NEXT: .cfi_llvm_vector_registers 1, {{[0-9]+}}, {{[0-9]+}}, 32 - -; CHECK-NOT: .cfi_{{.*}} -; CHECK: .cfi_endproc define hidden void @func_saved_in_clobbered_vgpr() #0 { +; WAVE64-LABEL: func_saved_in_clobbered_vgpr: +; WAVE64: .Lfunc_begin1: +; WAVE64-NEXT: .cfi_startproc +; WAVE64-NEXT: ; %bb.0: ; %entry +; WAVE64-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE64-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 2560, 0 +; WAVE64-NEXT: s_mov_b64 exec, s[4:5] +; WAVE64-NEXT: v_writelane_b32 v0, exec_lo, 0 +; WAVE64-NEXT: v_writelane_b32 v0, exec_hi, 1 +; WAVE64-NEXT: .cfi_llvm_vector_registers 17, 2560, 0, 32, 2560, 1, 32 +; WAVE64-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; WAVE64-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; WAVE64-NEXT: s_mov_b64 exec, s[4:5] +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: func_saved_in_clobbered_vgpr: +; WAVE32: .Lfunc_begin1: +; 
WAVE32-NEXT: .cfi_startproc +; WAVE32-NEXT: ; %bb.0: ; %entry +; WAVE32-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: s_xor_saveexec_b32 s4, -1 +; WAVE32-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_offset 1536, 0 +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s4 +; WAVE32-NEXT: v_writelane_b32 v0, exec_lo, 0 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1, 1536, 0, 32 +; WAVE32-NEXT: s_xor_saveexec_b32 s4, -1 +; WAVE32-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s4 +; WAVE32-NEXT: s_waitcnt vmcnt(0) +; WAVE32-NEXT: s_setpc_b64 s[30:31] entry: ret void } ; Check that the option causes a CSR VGPR to spill when needed. - -; CHECK-LABEL: func_saved_in_preserved_vgpr: -; CHECK: %bb.0: - -; CHECK: s_or_saveexec_b{{(32|64)}} -; CHECK: buffer_store_dword [[CSR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK: s_mov_b{{(32|64)}} {{(exec|exec_lo)}}, - -; WAVE64: v_writelane_b32 [[CSR]], exec_lo, {{[0-9]+}} -; WAVE64-NEXT: v_writelane_b32 [[CSR]], exec_hi, {{[0-9]+}} - -; WAVE32: v_writelane_b32 [[CSR]], exec_lo, {{[0-9]+}} - define hidden void @func_saved_in_preserved_vgpr() #0 { +; WAVE64-LABEL: func_saved_in_preserved_vgpr: +; WAVE64: .Lfunc_begin2: +; WAVE64-NEXT: .cfi_startproc +; WAVE64-NEXT: ; %bb.0: ; %entry +; WAVE64-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE64-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE64-NEXT: .cfi_undefined 2560 +; WAVE64-NEXT: .cfi_undefined 2561 +; WAVE64-NEXT: .cfi_undefined 2562 +; WAVE64-NEXT: .cfi_undefined 2563 +; WAVE64-NEXT: .cfi_undefined 2564 +; WAVE64-NEXT: .cfi_undefined 2565 +; WAVE64-NEXT: .cfi_undefined 2566 +; WAVE64-NEXT: .cfi_undefined 2567 +; WAVE64-NEXT: .cfi_undefined 2568 +; WAVE64-NEXT: .cfi_undefined 2569 
+; WAVE64-NEXT: .cfi_undefined 2570 +; WAVE64-NEXT: .cfi_undefined 2571 +; WAVE64-NEXT: .cfi_undefined 2572 +; WAVE64-NEXT: .cfi_undefined 2573 +; WAVE64-NEXT: .cfi_undefined 2574 +; WAVE64-NEXT: .cfi_undefined 2575 +; WAVE64-NEXT: .cfi_undefined 2576 +; WAVE64-NEXT: .cfi_undefined 2577 +; WAVE64-NEXT: .cfi_undefined 2578 +; WAVE64-NEXT: .cfi_undefined 2579 +; WAVE64-NEXT: .cfi_undefined 2580 +; WAVE64-NEXT: .cfi_undefined 2581 +; WAVE64-NEXT: .cfi_undefined 2582 +; WAVE64-NEXT: .cfi_undefined 2583 +; WAVE64-NEXT: .cfi_undefined 2584 +; WAVE64-NEXT: .cfi_undefined 2585 +; WAVE64-NEXT: .cfi_undefined 2586 +; WAVE64-NEXT: .cfi_undefined 2587 +; WAVE64-NEXT: .cfi_undefined 2588 +; WAVE64-NEXT: .cfi_undefined 2589 +; WAVE64-NEXT: .cfi_undefined 2590 +; WAVE64-NEXT: .cfi_undefined 2591 +; WAVE64-NEXT: .cfi_undefined 2592 +; WAVE64-NEXT: .cfi_undefined 2593 +; WAVE64-NEXT: .cfi_undefined 2594 +; WAVE64-NEXT: .cfi_undefined 2595 +; WAVE64-NEXT: .cfi_undefined 2596 +; WAVE64-NEXT: .cfi_undefined 2597 +; WAVE64-NEXT: .cfi_undefined 2598 +; WAVE64-NEXT: .cfi_undefined 2599 +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; WAVE64-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 2600, 0 +; WAVE64-NEXT: s_mov_b64 exec, s[4:5] +; WAVE64-NEXT: v_writelane_b32 v40, exec_lo, 0 +; WAVE64-NEXT: v_writelane_b32 v40, exec_hi, 1 +; WAVE64-NEXT: .cfi_llvm_vector_registers 17, 2600, 0, 32, 2600, 1, 32 +; WAVE64-NEXT: ;;#ASMSTART +; WAVE64-NEXT: ; clobber nonpreserved VGPRs +; WAVE64-NEXT: ;;#ASMEND +; WAVE64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; WAVE64-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; WAVE64-NEXT: s_mov_b64 exec, s[4:5] +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: func_saved_in_preserved_vgpr: +; WAVE32: .Lfunc_begin2: +; WAVE32-NEXT: .cfi_startproc +; WAVE32-NEXT: ; %bb.0: ; %entry +; WAVE32-NEXT: 
.cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE32-NEXT: .cfi_undefined 1536 +; WAVE32-NEXT: .cfi_undefined 1537 +; WAVE32-NEXT: .cfi_undefined 1538 +; WAVE32-NEXT: .cfi_undefined 1539 +; WAVE32-NEXT: .cfi_undefined 1540 +; WAVE32-NEXT: .cfi_undefined 1541 +; WAVE32-NEXT: .cfi_undefined 1542 +; WAVE32-NEXT: .cfi_undefined 1543 +; WAVE32-NEXT: .cfi_undefined 1544 +; WAVE32-NEXT: .cfi_undefined 1545 +; WAVE32-NEXT: .cfi_undefined 1546 +; WAVE32-NEXT: .cfi_undefined 1547 +; WAVE32-NEXT: .cfi_undefined 1548 +; WAVE32-NEXT: .cfi_undefined 1549 +; WAVE32-NEXT: .cfi_undefined 1550 +; WAVE32-NEXT: .cfi_undefined 1551 +; WAVE32-NEXT: .cfi_undefined 1552 +; WAVE32-NEXT: .cfi_undefined 1553 +; WAVE32-NEXT: .cfi_undefined 1554 +; WAVE32-NEXT: .cfi_undefined 1555 +; WAVE32-NEXT: .cfi_undefined 1556 +; WAVE32-NEXT: .cfi_undefined 1557 +; WAVE32-NEXT: .cfi_undefined 1558 +; WAVE32-NEXT: .cfi_undefined 1559 +; WAVE32-NEXT: .cfi_undefined 1560 +; WAVE32-NEXT: .cfi_undefined 1561 +; WAVE32-NEXT: .cfi_undefined 1562 +; WAVE32-NEXT: .cfi_undefined 1563 +; WAVE32-NEXT: .cfi_undefined 1564 +; WAVE32-NEXT: .cfi_undefined 1565 +; WAVE32-NEXT: .cfi_undefined 1566 +; WAVE32-NEXT: .cfi_undefined 1567 +; WAVE32-NEXT: .cfi_undefined 1568 +; WAVE32-NEXT: .cfi_undefined 1569 +; WAVE32-NEXT: .cfi_undefined 1570 +; WAVE32-NEXT: .cfi_undefined 1571 +; WAVE32-NEXT: .cfi_undefined 1572 +; WAVE32-NEXT: .cfi_undefined 1573 +; WAVE32-NEXT: .cfi_undefined 1574 +; WAVE32-NEXT: .cfi_undefined 1575 +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: s_or_saveexec_b32 s4, -1 +; WAVE32-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_offset 1576, 0 +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s4 +; WAVE32-NEXT: v_writelane_b32 v40, exec_lo, 0 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1, 1576, 0, 32 +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber nonpreserved 
VGPRs +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: s_or_saveexec_b32 s4, -1 +; WAVE32-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s4 +; WAVE32-NEXT: s_waitcnt vmcnt(0) +; WAVE32-NEXT: s_setpc_b64 s[30:31] entry: call void asm sideeffect "; clobber nonpreserved VGPRs", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} @@ -61,25 +198,128 @@ entry: } ; There's no return here, so the return address live in was deleted. -; CHECK-LABEL: {{^}}empty_func: -; CHECK-NOT: v_writelane_b32 v0, s30, 0 -; CHECK-NOT: v_writelane_b32 v0, s31, 1 define void @empty_func() { +; WAVE64-LABEL: empty_func: +; WAVE64: .Lfunc_begin3: +; WAVE64-NEXT: .cfi_startproc +; WAVE64-NEXT: ; %bb.0: +; WAVE64-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE64-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 2560, 0 +; WAVE64-NEXT: s_mov_b64 exec, s[4:5] +; WAVE64-NEXT: v_writelane_b32 v0, exec_lo, 0 +; WAVE64-NEXT: v_writelane_b32 v0, exec_hi, 1 +; +; WAVE32-LABEL: empty_func: +; WAVE32: .Lfunc_begin3: +; WAVE32-NEXT: .cfi_startproc +; WAVE32-NEXT: ; %bb.0: +; WAVE32-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: s_xor_saveexec_b32 s4, -1 +; WAVE32-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_offset 1536, 0 +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s4 +; WAVE32-NEXT: v_writelane_b32 v0, exec_lo, 0 unreachable } ; Check that the option causes RA and EXEC to be spilled to memory. 
- -; CHECK-LABEL: no_vgprs_to_spill_into: -; CHECK: %bb.0: - -; WAVE64: v_mov_b32_e32 v0, exec_lo -; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; WAVE64-NEXT: v_mov_b32_e32 v0, exec_hi -; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; WAVE64-NEXT: .cfi_offset 17, 0 - define void @no_vgprs_to_spill_into() #1 { +; WAVE64-LABEL: no_vgprs_to_spill_into: +; WAVE64: .Lfunc_begin4: +; WAVE64-NEXT: .cfi_startproc +; WAVE64-NEXT: ; %bb.0: +; WAVE64-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE64-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE64-NEXT: .cfi_undefined 2560 +; WAVE64-NEXT: .cfi_undefined 2561 +; WAVE64-NEXT: .cfi_undefined 2562 +; WAVE64-NEXT: .cfi_undefined 2563 +; WAVE64-NEXT: .cfi_undefined 2564 +; WAVE64-NEXT: .cfi_undefined 2565 +; WAVE64-NEXT: .cfi_undefined 2566 +; WAVE64-NEXT: .cfi_undefined 2567 +; WAVE64-NEXT: .cfi_undefined 2568 +; WAVE64-NEXT: .cfi_undefined 2569 +; WAVE64-NEXT: .cfi_undefined 2570 +; WAVE64-NEXT: .cfi_undefined 2571 +; WAVE64-NEXT: .cfi_undefined 2572 +; WAVE64-NEXT: .cfi_undefined 2573 +; WAVE64-NEXT: .cfi_undefined 2574 +; WAVE64-NEXT: .cfi_undefined 2575 +; WAVE64-NEXT: .cfi_undefined 2576 +; WAVE64-NEXT: .cfi_undefined 2577 +; WAVE64-NEXT: .cfi_undefined 2578 +; WAVE64-NEXT: .cfi_undefined 2579 +; WAVE64-NEXT: .cfi_undefined 2580 +; WAVE64-NEXT: .cfi_undefined 2581 +; WAVE64-NEXT: .cfi_undefined 2582 +; WAVE64-NEXT: .cfi_undefined 2583 +; WAVE64-NEXT: .cfi_undefined 2584 +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: v_mov_b32_e32 v0, exec_lo +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE64-NEXT: v_mov_b32_e32 v0, exec_hi +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 17, 0 +; WAVE64-NEXT: ;;#ASMSTART +; WAVE64-NEXT: ;;#ASMEND +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; 
WAVE32-LABEL: no_vgprs_to_spill_into: +; WAVE32: .Lfunc_begin4: +; WAVE32-NEXT: .cfi_startproc +; WAVE32-NEXT: ; %bb.0: +; WAVE32-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE32-NEXT: .cfi_undefined 1536 +; WAVE32-NEXT: .cfi_undefined 1537 +; WAVE32-NEXT: .cfi_undefined 1538 +; WAVE32-NEXT: .cfi_undefined 1539 +; WAVE32-NEXT: .cfi_undefined 1540 +; WAVE32-NEXT: .cfi_undefined 1541 +; WAVE32-NEXT: .cfi_undefined 1542 +; WAVE32-NEXT: .cfi_undefined 1543 +; WAVE32-NEXT: .cfi_undefined 1544 +; WAVE32-NEXT: .cfi_undefined 1545 +; WAVE32-NEXT: .cfi_undefined 1546 +; WAVE32-NEXT: .cfi_undefined 1547 +; WAVE32-NEXT: .cfi_undefined 1548 +; WAVE32-NEXT: .cfi_undefined 1549 +; WAVE32-NEXT: .cfi_undefined 1550 +; WAVE32-NEXT: .cfi_undefined 1551 +; WAVE32-NEXT: .cfi_undefined 1552 +; WAVE32-NEXT: .cfi_undefined 1553 +; WAVE32-NEXT: .cfi_undefined 1554 +; WAVE32-NEXT: .cfi_undefined 1555 +; WAVE32-NEXT: .cfi_undefined 1556 +; WAVE32-NEXT: .cfi_undefined 1557 +; WAVE32-NEXT: .cfi_undefined 1558 +; WAVE32-NEXT: .cfi_undefined 1559 +; WAVE32-NEXT: .cfi_undefined 1560 +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: s_xor_saveexec_b32 s4, -1 +; WAVE32-NEXT: buffer_store_dword v25, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_offset 1561, 0 +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s4 +; WAVE32-NEXT: v_writelane_b32 v25, exec_lo, 0 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1, 1561, 0, 32 +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: s_xor_saveexec_b32 s4, -1 +; WAVE32-NEXT: buffer_load_dword v25, off, s[0:3], s32 ; 4-byte Folded Reload +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s4 +; WAVE32-NEXT: s_waitcnt vmcnt(0) +; WAVE32-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} 
,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} @@ -90,34 +330,781 @@ define void @no_vgprs_to_spill_into() #1 { ; Check that the FP and EXEC needs to be spilled to memory, even though ; we have reserved VGPR but there are no available free lanes. - -; CHECK-LABEL: callee_need_to_spill_fp_exec_to_memory: -; CHECK: %bb.0: - -; WAVE32: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; WAVE32: s_xor_saveexec_b32 [[EXEC_COPY:s[0-9]+]], -1 -; WAVE32-NEXT: buffer_store_dword [[RES_VGPR:v[0-9]+]], off, s[0:3], s33 offset:192 ; 4-byte Folded Spill -; WAVE32: s_mov_b32 exec_lo, [[EXEC_COPY]] -; WAVE32-NEXT: v_mov_b32_e32 [[TEMP_VGPR:v[0-9]+]], exec_lo -; WAVE32-NEXT: buffer_store_dword [[TEMP_VGPR]], off, s[0:3], s33 offset:196 ; 4-byte Folded Spill -; WAVE32-NEXT: .cfi_offset 1, 6272 -; WAVE32-NEXT: v_mov_b32_e32 [[TEMP_VGPR:v[0-9]+]], [[FP_SCRATCH_COPY]] -; WAVE32-NEXT: buffer_store_dword [[TEMP_VGPR]], off, s[0:3], s33 offset:200 ; 4-byte Folded Spill -; WAVE32: buffer_store_dword v40, off, s[0:3], s33 offset -; WAVE32-COUNT-47: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 -; WAVE32: v_writelane_b32 [[RES_VGPR]], s34, 0 -; WAVE32-COUNT-31: v_writelane_b32 [[RES_VGPR]], s{{[0-9]+}}, {{[0-9]+}} - - define void @callee_need_to_spill_fp_exec_to_memory() #2 { +; WAVE64-LABEL: callee_need_to_spill_fp_exec_to_memory: +; WAVE64: .Lfunc_begin5: +; WAVE64-NEXT: .cfi_startproc +; WAVE64-NEXT: ; %bb.0: +; WAVE64-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE64-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE64-NEXT: .cfi_undefined 2560 +; WAVE64-NEXT: .cfi_undefined 2561 +; WAVE64-NEXT: .cfi_undefined 2562 +; WAVE64-NEXT: .cfi_undefined 2563 +; WAVE64-NEXT: .cfi_undefined 2564 +; WAVE64-NEXT: .cfi_undefined 2565 +; WAVE64-NEXT: .cfi_undefined 2566 +; WAVE64-NEXT: .cfi_undefined 2567 +; WAVE64-NEXT: .cfi_undefined 2568 +; WAVE64-NEXT: .cfi_undefined 2569 +; WAVE64-NEXT: .cfi_undefined 2570 +; WAVE64-NEXT: .cfi_undefined 2571 +; WAVE64-NEXT: 
.cfi_undefined 2572 +; WAVE64-NEXT: .cfi_undefined 2573 +; WAVE64-NEXT: .cfi_undefined 2574 +; WAVE64-NEXT: .cfi_undefined 2575 +; WAVE64-NEXT: .cfi_undefined 2576 +; WAVE64-NEXT: .cfi_undefined 2577 +; WAVE64-NEXT: .cfi_undefined 2578 +; WAVE64-NEXT: .cfi_undefined 2579 +; WAVE64-NEXT: .cfi_undefined 2580 +; WAVE64-NEXT: .cfi_undefined 2581 +; WAVE64-NEXT: .cfi_undefined 2582 +; WAVE64-NEXT: .cfi_undefined 2583 +; WAVE64-NEXT: .cfi_undefined 2584 +; WAVE64-NEXT: .cfi_undefined 2585 +; WAVE64-NEXT: .cfi_undefined 2586 +; WAVE64-NEXT: .cfi_undefined 2587 +; WAVE64-NEXT: .cfi_undefined 2588 +; WAVE64-NEXT: .cfi_undefined 2589 +; WAVE64-NEXT: .cfi_undefined 2590 +; WAVE64-NEXT: .cfi_undefined 2591 +; WAVE64-NEXT: .cfi_undefined 2592 +; WAVE64-NEXT: .cfi_undefined 2593 +; WAVE64-NEXT: .cfi_undefined 2594 +; WAVE64-NEXT: .cfi_undefined 2595 +; WAVE64-NEXT: .cfi_undefined 2596 +; WAVE64-NEXT: .cfi_undefined 2597 +; WAVE64-NEXT: .cfi_undefined 2598 +; WAVE64-NEXT: .cfi_undefined 2599 +; WAVE64-NEXT: .cfi_undefined 2608 +; WAVE64-NEXT: .cfi_undefined 2609 +; WAVE64-NEXT: .cfi_undefined 2610 +; WAVE64-NEXT: .cfi_undefined 2611 +; WAVE64-NEXT: .cfi_undefined 2612 +; WAVE64-NEXT: .cfi_undefined 2613 +; WAVE64-NEXT: .cfi_undefined 2614 +; WAVE64-NEXT: .cfi_undefined 2615 +; WAVE64-NEXT: .cfi_undefined 2624 +; WAVE64-NEXT: .cfi_undefined 2625 +; WAVE64-NEXT: .cfi_undefined 2626 +; WAVE64-NEXT: .cfi_undefined 2627 +; WAVE64-NEXT: .cfi_undefined 2628 +; WAVE64-NEXT: .cfi_undefined 2629 +; WAVE64-NEXT: .cfi_undefined 2630 +; WAVE64-NEXT: .cfi_undefined 2631 +; WAVE64-NEXT: .cfi_undefined 2640 +; WAVE64-NEXT: .cfi_undefined 2641 +; WAVE64-NEXT: .cfi_undefined 2642 +; WAVE64-NEXT: .cfi_undefined 2643 +; WAVE64-NEXT: .cfi_undefined 2644 +; WAVE64-NEXT: .cfi_undefined 2645 +; WAVE64-NEXT: .cfi_undefined 2646 +; WAVE64-NEXT: .cfi_undefined 2647 +; WAVE64-NEXT: .cfi_undefined 2656 +; WAVE64-NEXT: .cfi_undefined 2657 +; WAVE64-NEXT: .cfi_undefined 2658 +; WAVE64-NEXT: .cfi_undefined 2659 
+; WAVE64-NEXT: .cfi_undefined 2660 +; WAVE64-NEXT: .cfi_undefined 2661 +; WAVE64-NEXT: .cfi_undefined 2662 +; WAVE64-NEXT: .cfi_undefined 2663 +; WAVE64-NEXT: .cfi_undefined 2672 +; WAVE64-NEXT: .cfi_undefined 2673 +; WAVE64-NEXT: .cfi_undefined 2674 +; WAVE64-NEXT: .cfi_undefined 2675 +; WAVE64-NEXT: .cfi_undefined 2676 +; WAVE64-NEXT: .cfi_undefined 2677 +; WAVE64-NEXT: .cfi_undefined 2678 +; WAVE64-NEXT: .cfi_undefined 2679 +; WAVE64-NEXT: .cfi_undefined 2688 +; WAVE64-NEXT: .cfi_undefined 2689 +; WAVE64-NEXT: .cfi_undefined 36 +; WAVE64-NEXT: .cfi_undefined 37 +; WAVE64-NEXT: .cfi_undefined 38 +; WAVE64-NEXT: .cfi_undefined 39 +; WAVE64-NEXT: .cfi_undefined 40 +; WAVE64-NEXT: .cfi_undefined 41 +; WAVE64-NEXT: .cfi_undefined 42 +; WAVE64-NEXT: .cfi_undefined 43 +; WAVE64-NEXT: .cfi_undefined 44 +; WAVE64-NEXT: .cfi_undefined 45 +; WAVE64-NEXT: .cfi_undefined 46 +; WAVE64-NEXT: .cfi_undefined 47 +; WAVE64-NEXT: .cfi_undefined 48 +; WAVE64-NEXT: .cfi_undefined 49 +; WAVE64-NEXT: .cfi_undefined 50 +; WAVE64-NEXT: .cfi_undefined 51 +; WAVE64-NEXT: .cfi_undefined 52 +; WAVE64-NEXT: .cfi_undefined 53 +; WAVE64-NEXT: .cfi_undefined 54 +; WAVE64-NEXT: .cfi_undefined 55 +; WAVE64-NEXT: .cfi_undefined 56 +; WAVE64-NEXT: .cfi_undefined 57 +; WAVE64-NEXT: .cfi_undefined 58 +; WAVE64-NEXT: .cfi_undefined 59 +; WAVE64-NEXT: .cfi_undefined 60 +; WAVE64-NEXT: .cfi_undefined 61 +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: s_mov_b32 s40, s33 +; WAVE64-NEXT: .cfi_register 65, 72 +; WAVE64-NEXT: s_mov_b32 s33, s32 +; WAVE64-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; WAVE64-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 2599, 12288 +; WAVE64-NEXT: s_mov_b64 exec, s[4:5] +; WAVE64-NEXT: v_writelane_b32 v39, exec_lo, 32 +; WAVE64-NEXT: v_writelane_b32 v39, exec_hi, 33 +; WAVE64-NEXT: .cfi_llvm_vector_registers 17, 2599, 32, 32, 2599, 33, 32 +; WAVE64-NEXT: .cfi_def_cfa_register 65 +; WAVE64-NEXT: 
s_addk_i32 s32, 0x3200 +; WAVE64-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 12032 +; WAVE64-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 11776 +; WAVE64-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2602, 32, 17, 64, 11520 +; WAVE64-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2603, 32, 17, 64, 11264 +; WAVE64-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2604, 32, 17, 64, 11008 +; WAVE64-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2605, 32, 17, 64, 10752 +; WAVE64-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2606, 32, 17, 64, 10496 +; WAVE64-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2607, 32, 17, 64, 10240 +; WAVE64-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2616, 32, 17, 64, 9984 +; WAVE64-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2617, 32, 17, 64, 9728 +; WAVE64-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2618, 32, 17, 64, 9472 +; WAVE64-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2619, 32, 17, 64, 9216 +; WAVE64-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 
2620, 32, 17, 64, 8960 +; WAVE64-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2621, 32, 17, 64, 8704 +; WAVE64-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2622, 32, 17, 64, 8448 +; WAVE64-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2623, 32, 17, 64, 8192 +; WAVE64-NEXT: buffer_store_dword v72, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2632, 32, 17, 64, 7936 +; WAVE64-NEXT: buffer_store_dword v73, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2633, 32, 17, 64, 7680 +; WAVE64-NEXT: buffer_store_dword v74, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2634, 32, 17, 64, 7424 +; WAVE64-NEXT: buffer_store_dword v75, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2635, 32, 17, 64, 7168 +; WAVE64-NEXT: buffer_store_dword v76, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2636, 32, 17, 64, 6912 +; WAVE64-NEXT: buffer_store_dword v77, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2637, 32, 17, 64, 6656 +; WAVE64-NEXT: buffer_store_dword v78, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2638, 32, 17, 64, 6400 +; WAVE64-NEXT: buffer_store_dword v79, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2639, 32, 17, 64, 6144 +; WAVE64-NEXT: buffer_store_dword v88, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2648, 32, 17, 64, 5888 +; WAVE64-NEXT: buffer_store_dword v89, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2649, 32, 17, 
64, 5632 +; WAVE64-NEXT: buffer_store_dword v90, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2650, 32, 17, 64, 5376 +; WAVE64-NEXT: buffer_store_dword v91, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2651, 32, 17, 64, 5120 +; WAVE64-NEXT: buffer_store_dword v92, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2652, 32, 17, 64, 4864 +; WAVE64-NEXT: buffer_store_dword v93, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2653, 32, 17, 64, 4608 +; WAVE64-NEXT: buffer_store_dword v94, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2654, 32, 17, 64, 4352 +; WAVE64-NEXT: buffer_store_dword v95, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2655, 32, 17, 64, 4096 +; WAVE64-NEXT: buffer_store_dword v104, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2664, 32, 17, 64, 3840 +; WAVE64-NEXT: buffer_store_dword v105, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2665, 32, 17, 64, 3584 +; WAVE64-NEXT: buffer_store_dword v106, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2666, 32, 17, 64, 3328 +; WAVE64-NEXT: buffer_store_dword v107, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2667, 32, 17, 64, 3072 +; WAVE64-NEXT: buffer_store_dword v108, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2668, 32, 17, 64, 2816 +; WAVE64-NEXT: buffer_store_dword v109, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2669, 32, 17, 64, 2560 +; WAVE64-NEXT: buffer_store_dword v110, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2670, 32, 17, 64, 2304 +; 
WAVE64-NEXT: buffer_store_dword v111, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2671, 32, 17, 64, 2048 +; WAVE64-NEXT: buffer_store_dword v120, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2680, 32, 17, 64, 1792 +; WAVE64-NEXT: buffer_store_dword v121, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2681, 32, 17, 64, 1536 +; WAVE64-NEXT: buffer_store_dword v122, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2682, 32, 17, 64, 1280 +; WAVE64-NEXT: buffer_store_dword v123, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2683, 32, 17, 64, 1024 +; WAVE64-NEXT: buffer_store_dword v124, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2684, 32, 17, 64, 768 +; WAVE64-NEXT: buffer_store_dword v125, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2685, 32, 17, 64, 512 +; WAVE64-NEXT: buffer_store_dword v126, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2686, 32, 17, 64, 256 +; WAVE64-NEXT: buffer_store_dword v127, off, s[0:3], s33 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2687, 32, 17, 64, 0 +; WAVE64-NEXT: v_writelane_b32 v39, s34, 0 +; WAVE64-NEXT: .cfi_llvm_vector_registers 66, 2599, 0, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s35, 1 +; WAVE64-NEXT: .cfi_llvm_vector_registers 67, 2599, 1, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s36, 2 +; WAVE64-NEXT: .cfi_llvm_vector_registers 68, 2599, 2, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s37, 3 +; WAVE64-NEXT: .cfi_llvm_vector_registers 69, 2599, 3, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s38, 4 +; WAVE64-NEXT: .cfi_llvm_vector_registers 70, 2599, 4, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s39, 5 +; WAVE64-NEXT: .cfi_llvm_vector_registers 71, 2599, 5, 32 +; WAVE64-NEXT: 
v_writelane_b32 v39, s48, 6 +; WAVE64-NEXT: .cfi_llvm_vector_registers 80, 2599, 6, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s49, 7 +; WAVE64-NEXT: .cfi_llvm_vector_registers 81, 2599, 7, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s50, 8 +; WAVE64-NEXT: .cfi_llvm_vector_registers 82, 2599, 8, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s51, 9 +; WAVE64-NEXT: .cfi_llvm_vector_registers 83, 2599, 9, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s52, 10 +; WAVE64-NEXT: .cfi_llvm_vector_registers 84, 2599, 10, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s53, 11 +; WAVE64-NEXT: .cfi_llvm_vector_registers 85, 2599, 11, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s54, 12 +; WAVE64-NEXT: .cfi_llvm_vector_registers 86, 2599, 12, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s55, 13 +; WAVE64-NEXT: .cfi_llvm_vector_registers 87, 2599, 13, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s64, 14 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1088, 2599, 14, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s65, 15 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1089, 2599, 15, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s66, 16 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1090, 2599, 16, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s67, 17 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1091, 2599, 17, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s68, 18 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1092, 2599, 18, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s69, 19 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1093, 2599, 19, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s70, 20 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1094, 2599, 20, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s71, 21 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1095, 2599, 21, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s80, 22 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1104, 2599, 22, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s81, 23 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1105, 2599, 23, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s82, 24 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1106, 2599, 24, 32 +; 
WAVE64-NEXT: v_writelane_b32 v39, s83, 25 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1107, 2599, 25, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s84, 26 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1108, 2599, 26, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s85, 27 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1109, 2599, 27, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s86, 28 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1110, 2599, 28, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s87, 29 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1111, 2599, 29, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s96, 30 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1120, 2599, 30, 32 +; WAVE64-NEXT: v_writelane_b32 v39, s97, 31 +; WAVE64-NEXT: .cfi_llvm_vector_registers 1121, 2599, 31, 32 +; WAVE64-NEXT: ;;#ASMSTART +; WAVE64-NEXT: ; clobber nonpreserved and 32 CSR SGPRs +; WAVE64-NEXT: ;;#ASMEND +; WAVE64-NEXT: ;;#ASMSTART +; WAVE64-NEXT: ; clobber all VGPRs except v39 +; WAVE64-NEXT: ;;#ASMEND +; WAVE64-NEXT: buffer_load_dword v127, off, s[0:3], s33 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v126, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v125, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v124, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v123, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v122, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v121, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v120, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v111, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v110, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v109, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v108, off, s[0:3], s33 
offset:44 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v107, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v106, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v105, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v104, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v95, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v92, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v91, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v90, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v89, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v88, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v79, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v78, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v77, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v76, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v75, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v74, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v73, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v72, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:132 ; 4-byte Folded 
Reload +; WAVE64-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:172 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:176 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:180 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:184 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:188 ; 4-byte Folded Reload +; WAVE64-NEXT: v_readlane_b32 s97, v39, 31 +; WAVE64-NEXT: v_readlane_b32 s96, v39, 30 +; WAVE64-NEXT: v_readlane_b32 s87, v39, 29 +; WAVE64-NEXT: v_readlane_b32 s86, v39, 28 +; WAVE64-NEXT: v_readlane_b32 s85, v39, 27 +; WAVE64-NEXT: v_readlane_b32 s84, v39, 26 +; WAVE64-NEXT: v_readlane_b32 s83, v39, 25 +; WAVE64-NEXT: v_readlane_b32 s82, v39, 24 +; WAVE64-NEXT: v_readlane_b32 s81, v39, 23 +; WAVE64-NEXT: v_readlane_b32 s80, v39, 22 +; WAVE64-NEXT: v_readlane_b32 s71, v39, 21 +; WAVE64-NEXT: v_readlane_b32 s70, v39, 20 +; WAVE64-NEXT: v_readlane_b32 s69, v39, 19 +; WAVE64-NEXT: v_readlane_b32 s68, v39, 18 +; WAVE64-NEXT: v_readlane_b32 s67, v39, 17 +; WAVE64-NEXT: v_readlane_b32 s66, v39, 16 +; WAVE64-NEXT: 
v_readlane_b32 s65, v39, 15 +; WAVE64-NEXT: v_readlane_b32 s64, v39, 14 +; WAVE64-NEXT: v_readlane_b32 s55, v39, 13 +; WAVE64-NEXT: v_readlane_b32 s54, v39, 12 +; WAVE64-NEXT: v_readlane_b32 s53, v39, 11 +; WAVE64-NEXT: v_readlane_b32 s52, v39, 10 +; WAVE64-NEXT: v_readlane_b32 s51, v39, 9 +; WAVE64-NEXT: v_readlane_b32 s50, v39, 8 +; WAVE64-NEXT: v_readlane_b32 s49, v39, 7 +; WAVE64-NEXT: v_readlane_b32 s48, v39, 6 +; WAVE64-NEXT: v_readlane_b32 s39, v39, 5 +; WAVE64-NEXT: v_readlane_b32 s38, v39, 4 +; WAVE64-NEXT: v_readlane_b32 s37, v39, 3 +; WAVE64-NEXT: v_readlane_b32 s36, v39, 2 +; WAVE64-NEXT: v_readlane_b32 s35, v39, 1 +; WAVE64-NEXT: v_readlane_b32 s34, v39, 0 +; WAVE64-NEXT: s_mov_b32 s32, s33 +; WAVE64-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; WAVE64-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:192 ; 4-byte Folded Reload +; WAVE64-NEXT: s_mov_b64 exec, s[4:5] +; WAVE64-NEXT: .cfi_def_cfa_register 64 +; WAVE64-NEXT: s_mov_b32 s33, s40 +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: callee_need_to_spill_fp_exec_to_memory: +; WAVE32: .Lfunc_begin5: +; WAVE32-NEXT: .cfi_startproc +; WAVE32-NEXT: ; %bb.0: +; WAVE32-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE32-NEXT: .cfi_undefined 1536 +; WAVE32-NEXT: .cfi_undefined 1537 +; WAVE32-NEXT: .cfi_undefined 1538 +; WAVE32-NEXT: .cfi_undefined 1539 +; WAVE32-NEXT: .cfi_undefined 1540 +; WAVE32-NEXT: .cfi_undefined 1541 +; WAVE32-NEXT: .cfi_undefined 1542 +; WAVE32-NEXT: .cfi_undefined 1543 +; WAVE32-NEXT: .cfi_undefined 1544 +; WAVE32-NEXT: .cfi_undefined 1545 +; WAVE32-NEXT: .cfi_undefined 1546 +; WAVE32-NEXT: .cfi_undefined 1547 +; WAVE32-NEXT: .cfi_undefined 1548 +; WAVE32-NEXT: .cfi_undefined 1549 +; WAVE32-NEXT: .cfi_undefined 1550 +; WAVE32-NEXT: .cfi_undefined 1551 +; WAVE32-NEXT: .cfi_undefined 1552 +; WAVE32-NEXT: .cfi_undefined 1553 +; WAVE32-NEXT: .cfi_undefined 1554 +; WAVE32-NEXT: 
.cfi_undefined 1555 +; WAVE32-NEXT: .cfi_undefined 1556 +; WAVE32-NEXT: .cfi_undefined 1557 +; WAVE32-NEXT: .cfi_undefined 1558 +; WAVE32-NEXT: .cfi_undefined 1559 +; WAVE32-NEXT: .cfi_undefined 1560 +; WAVE32-NEXT: .cfi_undefined 1561 +; WAVE32-NEXT: .cfi_undefined 1562 +; WAVE32-NEXT: .cfi_undefined 1563 +; WAVE32-NEXT: .cfi_undefined 1564 +; WAVE32-NEXT: .cfi_undefined 1565 +; WAVE32-NEXT: .cfi_undefined 1566 +; WAVE32-NEXT: .cfi_undefined 1567 +; WAVE32-NEXT: .cfi_undefined 1568 +; WAVE32-NEXT: .cfi_undefined 1569 +; WAVE32-NEXT: .cfi_undefined 1570 +; WAVE32-NEXT: .cfi_undefined 1571 +; WAVE32-NEXT: .cfi_undefined 1572 +; WAVE32-NEXT: .cfi_undefined 1573 +; WAVE32-NEXT: .cfi_undefined 1574 +; WAVE32-NEXT: .cfi_undefined 1575 +; WAVE32-NEXT: .cfi_undefined 1584 +; WAVE32-NEXT: .cfi_undefined 1585 +; WAVE32-NEXT: .cfi_undefined 1586 +; WAVE32-NEXT: .cfi_undefined 1587 +; WAVE32-NEXT: .cfi_undefined 1588 +; WAVE32-NEXT: .cfi_undefined 1589 +; WAVE32-NEXT: .cfi_undefined 1590 +; WAVE32-NEXT: .cfi_undefined 1591 +; WAVE32-NEXT: .cfi_undefined 1600 +; WAVE32-NEXT: .cfi_undefined 1601 +; WAVE32-NEXT: .cfi_undefined 1602 +; WAVE32-NEXT: .cfi_undefined 1603 +; WAVE32-NEXT: .cfi_undefined 1604 +; WAVE32-NEXT: .cfi_undefined 1605 +; WAVE32-NEXT: .cfi_undefined 1606 +; WAVE32-NEXT: .cfi_undefined 1607 +; WAVE32-NEXT: .cfi_undefined 1616 +; WAVE32-NEXT: .cfi_undefined 1617 +; WAVE32-NEXT: .cfi_undefined 1618 +; WAVE32-NEXT: .cfi_undefined 1619 +; WAVE32-NEXT: .cfi_undefined 1620 +; WAVE32-NEXT: .cfi_undefined 1621 +; WAVE32-NEXT: .cfi_undefined 1622 +; WAVE32-NEXT: .cfi_undefined 1623 +; WAVE32-NEXT: .cfi_undefined 1632 +; WAVE32-NEXT: .cfi_undefined 1633 +; WAVE32-NEXT: .cfi_undefined 1634 +; WAVE32-NEXT: .cfi_undefined 1635 +; WAVE32-NEXT: .cfi_undefined 1636 +; WAVE32-NEXT: .cfi_undefined 1637 +; WAVE32-NEXT: .cfi_undefined 1638 +; WAVE32-NEXT: .cfi_undefined 1639 +; WAVE32-NEXT: .cfi_undefined 1648 +; WAVE32-NEXT: .cfi_undefined 1649 +; WAVE32-NEXT: .cfi_undefined 1650 
+; WAVE32-NEXT: .cfi_undefined 1651 +; WAVE32-NEXT: .cfi_undefined 1652 +; WAVE32-NEXT: .cfi_undefined 1653 +; WAVE32-NEXT: .cfi_undefined 1654 +; WAVE32-NEXT: .cfi_undefined 1655 +; WAVE32-NEXT: .cfi_undefined 1664 +; WAVE32-NEXT: .cfi_undefined 1665 +; WAVE32-NEXT: .cfi_undefined 36 +; WAVE32-NEXT: .cfi_undefined 37 +; WAVE32-NEXT: .cfi_undefined 38 +; WAVE32-NEXT: .cfi_undefined 39 +; WAVE32-NEXT: .cfi_undefined 40 +; WAVE32-NEXT: .cfi_undefined 41 +; WAVE32-NEXT: .cfi_undefined 42 +; WAVE32-NEXT: .cfi_undefined 43 +; WAVE32-NEXT: .cfi_undefined 44 +; WAVE32-NEXT: .cfi_undefined 45 +; WAVE32-NEXT: .cfi_undefined 46 +; WAVE32-NEXT: .cfi_undefined 47 +; WAVE32-NEXT: .cfi_undefined 48 +; WAVE32-NEXT: .cfi_undefined 49 +; WAVE32-NEXT: .cfi_undefined 50 +; WAVE32-NEXT: .cfi_undefined 51 +; WAVE32-NEXT: .cfi_undefined 52 +; WAVE32-NEXT: .cfi_undefined 53 +; WAVE32-NEXT: .cfi_undefined 54 +; WAVE32-NEXT: .cfi_undefined 55 +; WAVE32-NEXT: .cfi_undefined 56 +; WAVE32-NEXT: .cfi_undefined 57 +; WAVE32-NEXT: .cfi_undefined 58 +; WAVE32-NEXT: .cfi_undefined 59 +; WAVE32-NEXT: .cfi_undefined 60 +; WAVE32-NEXT: .cfi_undefined 61 +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: s_mov_b32 s40, s33 +; WAVE32-NEXT: .cfi_register 65, 72 +; WAVE32-NEXT: s_mov_b32 s33, s32 +; WAVE32-NEXT: s_xor_saveexec_b32 s4, -1 +; WAVE32-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_offset 1575, 6144 +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s4 +; WAVE32-NEXT: v_mov_b32_e32 v0, exec_lo +; WAVE32-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_offset 1, 6272 +; WAVE32-NEXT: .cfi_def_cfa_register 65 +; WAVE32-NEXT: s_addk_i32 s32, 0x1980 +; WAVE32-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1576, 32, 1, 32, 6016 +; WAVE32-NEXT: buffer_store_dword v41, off, s[0:3], 
s33 offset:184 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1577, 32, 1, 32, 5888 +; WAVE32-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1578, 32, 1, 32, 5760 +; WAVE32-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1579, 32, 1, 32, 5632 +; WAVE32-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1580, 32, 1, 32, 5504 +; WAVE32-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1581, 32, 1, 32, 5376 +; WAVE32-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1582, 32, 1, 32, 5248 +; WAVE32-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1583, 32, 1, 32, 5120 +; WAVE32-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1592, 32, 1, 32, 4992 +; WAVE32-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1593, 32, 1, 32, 4864 +; WAVE32-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1594, 32, 1, 32, 4736 +; WAVE32-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1595, 32, 1, 32, 4608 +; WAVE32-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1596, 32, 1, 32, 4480 +; WAVE32-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1597, 32, 1, 32, 4352 +; WAVE32-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:132 ; 4-byte 
Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1598, 32, 1, 32, 4224 +; WAVE32-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1599, 32, 1, 32, 4096 +; WAVE32-NEXT: buffer_store_dword v72, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1608, 32, 1, 32, 3968 +; WAVE32-NEXT: buffer_store_dword v73, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1609, 32, 1, 32, 3840 +; WAVE32-NEXT: buffer_store_dword v74, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1610, 32, 1, 32, 3712 +; WAVE32-NEXT: buffer_store_dword v75, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1611, 32, 1, 32, 3584 +; WAVE32-NEXT: buffer_store_dword v76, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1612, 32, 1, 32, 3456 +; WAVE32-NEXT: buffer_store_dword v77, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1613, 32, 1, 32, 3328 +; WAVE32-NEXT: buffer_store_dword v78, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1614, 32, 1, 32, 3200 +; WAVE32-NEXT: buffer_store_dword v79, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1615, 32, 1, 32, 3072 +; WAVE32-NEXT: buffer_store_dword v88, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1624, 32, 1, 32, 2944 +; WAVE32-NEXT: buffer_store_dword v89, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1625, 32, 1, 32, 2816 +; WAVE32-NEXT: buffer_store_dword v90, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1626, 32, 1, 32, 2688 +; WAVE32-NEXT: buffer_store_dword v91, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; WAVE32-NEXT: 
.cfi_llvm_vector_offset 1627, 32, 1, 32, 2560 +; WAVE32-NEXT: buffer_store_dword v92, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1628, 32, 1, 32, 2432 +; WAVE32-NEXT: buffer_store_dword v93, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1629, 32, 1, 32, 2304 +; WAVE32-NEXT: buffer_store_dword v94, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1630, 32, 1, 32, 2176 +; WAVE32-NEXT: buffer_store_dword v95, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1631, 32, 1, 32, 2048 +; WAVE32-NEXT: buffer_store_dword v104, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1640, 32, 1, 32, 1920 +; WAVE32-NEXT: buffer_store_dword v105, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1641, 32, 1, 32, 1792 +; WAVE32-NEXT: buffer_store_dword v106, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1642, 32, 1, 32, 1664 +; WAVE32-NEXT: buffer_store_dword v107, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1643, 32, 1, 32, 1536 +; WAVE32-NEXT: buffer_store_dword v108, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1644, 32, 1, 32, 1408 +; WAVE32-NEXT: buffer_store_dword v109, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1645, 32, 1, 32, 1280 +; WAVE32-NEXT: buffer_store_dword v110, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1646, 32, 1, 32, 1152 +; WAVE32-NEXT: buffer_store_dword v111, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1647, 32, 1, 32, 1024 +; WAVE32-NEXT: buffer_store_dword v120, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 
1656, 32, 1, 32, 896 +; WAVE32-NEXT: buffer_store_dword v121, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1657, 32, 1, 32, 768 +; WAVE32-NEXT: buffer_store_dword v122, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1658, 32, 1, 32, 640 +; WAVE32-NEXT: buffer_store_dword v123, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1659, 32, 1, 32, 512 +; WAVE32-NEXT: buffer_store_dword v124, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1660, 32, 1, 32, 384 +; WAVE32-NEXT: buffer_store_dword v125, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1661, 32, 1, 32, 256 +; WAVE32-NEXT: buffer_store_dword v126, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1662, 32, 1, 32, 128 +; WAVE32-NEXT: buffer_store_dword v127, off, s[0:3], s33 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1663, 32, 1, 32, 0 +; WAVE32-NEXT: v_writelane_b32 v39, s34, 0 +; WAVE32-NEXT: .cfi_llvm_vector_registers 66, 1575, 0, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s35, 1 +; WAVE32-NEXT: .cfi_llvm_vector_registers 67, 1575, 1, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s36, 2 +; WAVE32-NEXT: .cfi_llvm_vector_registers 68, 1575, 2, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s37, 3 +; WAVE32-NEXT: .cfi_llvm_vector_registers 69, 1575, 3, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s38, 4 +; WAVE32-NEXT: .cfi_llvm_vector_registers 70, 1575, 4, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s39, 5 +; WAVE32-NEXT: .cfi_llvm_vector_registers 71, 1575, 5, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s48, 6 +; WAVE32-NEXT: .cfi_llvm_vector_registers 80, 1575, 6, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s49, 7 +; WAVE32-NEXT: .cfi_llvm_vector_registers 81, 1575, 7, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s50, 8 +; WAVE32-NEXT: .cfi_llvm_vector_registers 82, 1575, 8, 32 +; 
WAVE32-NEXT: v_writelane_b32 v39, s51, 9 +; WAVE32-NEXT: .cfi_llvm_vector_registers 83, 1575, 9, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s52, 10 +; WAVE32-NEXT: .cfi_llvm_vector_registers 84, 1575, 10, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s53, 11 +; WAVE32-NEXT: .cfi_llvm_vector_registers 85, 1575, 11, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s54, 12 +; WAVE32-NEXT: .cfi_llvm_vector_registers 86, 1575, 12, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s55, 13 +; WAVE32-NEXT: .cfi_llvm_vector_registers 87, 1575, 13, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s64, 14 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1088, 1575, 14, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s65, 15 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1089, 1575, 15, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s66, 16 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1090, 1575, 16, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s67, 17 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1091, 1575, 17, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s68, 18 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1092, 1575, 18, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s69, 19 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1093, 1575, 19, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s70, 20 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1094, 1575, 20, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s71, 21 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1095, 1575, 21, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s80, 22 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1104, 1575, 22, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s81, 23 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1105, 1575, 23, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s82, 24 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1106, 1575, 24, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s83, 25 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1107, 1575, 25, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s84, 26 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1108, 1575, 26, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s85, 27 +; WAVE32-NEXT: .cfi_llvm_vector_registers 
1109, 1575, 27, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s86, 28 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1110, 1575, 28, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s87, 29 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1111, 1575, 29, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s96, 30 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1120, 1575, 30, 32 +; WAVE32-NEXT: v_writelane_b32 v39, s97, 31 +; WAVE32-NEXT: .cfi_llvm_vector_registers 1121, 1575, 31, 32 +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber nonpreserved and 32 CSR SGPRs +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber all VGPRs except v39 +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: s_clause 0x2f +; WAVE32-NEXT: buffer_load_dword v127, off, s[0:3], s33 +; WAVE32-NEXT: buffer_load_dword v126, off, s[0:3], s33 offset:4 +; WAVE32-NEXT: buffer_load_dword v125, off, s[0:3], s33 offset:8 +; WAVE32-NEXT: buffer_load_dword v124, off, s[0:3], s33 offset:12 +; WAVE32-NEXT: buffer_load_dword v123, off, s[0:3], s33 offset:16 +; WAVE32-NEXT: buffer_load_dword v122, off, s[0:3], s33 offset:20 +; WAVE32-NEXT: buffer_load_dword v121, off, s[0:3], s33 offset:24 +; WAVE32-NEXT: buffer_load_dword v120, off, s[0:3], s33 offset:28 +; WAVE32-NEXT: buffer_load_dword v111, off, s[0:3], s33 offset:32 +; WAVE32-NEXT: buffer_load_dword v110, off, s[0:3], s33 offset:36 +; WAVE32-NEXT: buffer_load_dword v109, off, s[0:3], s33 offset:40 +; WAVE32-NEXT: buffer_load_dword v108, off, s[0:3], s33 offset:44 +; WAVE32-NEXT: buffer_load_dword v107, off, s[0:3], s33 offset:48 +; WAVE32-NEXT: buffer_load_dword v106, off, s[0:3], s33 offset:52 +; WAVE32-NEXT: buffer_load_dword v105, off, s[0:3], s33 offset:56 +; WAVE32-NEXT: buffer_load_dword v104, off, s[0:3], s33 offset:60 +; WAVE32-NEXT: buffer_load_dword v95, off, s[0:3], s33 offset:64 +; WAVE32-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:68 +; WAVE32-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:72 +; WAVE32-NEXT: buffer_load_dword v92, off, s[0:3], 
s33 offset:76 +; WAVE32-NEXT: buffer_load_dword v91, off, s[0:3], s33 offset:80 +; WAVE32-NEXT: buffer_load_dword v90, off, s[0:3], s33 offset:84 +; WAVE32-NEXT: buffer_load_dword v89, off, s[0:3], s33 offset:88 +; WAVE32-NEXT: buffer_load_dword v88, off, s[0:3], s33 offset:92 +; WAVE32-NEXT: buffer_load_dword v79, off, s[0:3], s33 offset:96 +; WAVE32-NEXT: buffer_load_dword v78, off, s[0:3], s33 offset:100 +; WAVE32-NEXT: buffer_load_dword v77, off, s[0:3], s33 offset:104 +; WAVE32-NEXT: buffer_load_dword v76, off, s[0:3], s33 offset:108 +; WAVE32-NEXT: buffer_load_dword v75, off, s[0:3], s33 offset:112 +; WAVE32-NEXT: buffer_load_dword v74, off, s[0:3], s33 offset:116 +; WAVE32-NEXT: buffer_load_dword v73, off, s[0:3], s33 offset:120 +; WAVE32-NEXT: buffer_load_dword v72, off, s[0:3], s33 offset:124 +; WAVE32-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:128 +; WAVE32-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:132 +; WAVE32-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:136 +; WAVE32-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:140 +; WAVE32-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:144 +; WAVE32-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:148 +; WAVE32-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:152 +; WAVE32-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:156 +; WAVE32-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:160 +; WAVE32-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:164 +; WAVE32-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:168 +; WAVE32-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:172 +; WAVE32-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:176 +; WAVE32-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:180 +; WAVE32-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:184 +; WAVE32-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:188 +; WAVE32-NEXT: v_readlane_b32 s97, v39, 31 +; WAVE32-NEXT: v_readlane_b32 s96, v39, 30 +; WAVE32-NEXT: 
v_readlane_b32 s87, v39, 29 +; WAVE32-NEXT: v_readlane_b32 s86, v39, 28 +; WAVE32-NEXT: v_readlane_b32 s85, v39, 27 +; WAVE32-NEXT: v_readlane_b32 s84, v39, 26 +; WAVE32-NEXT: v_readlane_b32 s83, v39, 25 +; WAVE32-NEXT: v_readlane_b32 s82, v39, 24 +; WAVE32-NEXT: v_readlane_b32 s81, v39, 23 +; WAVE32-NEXT: v_readlane_b32 s80, v39, 22 +; WAVE32-NEXT: v_readlane_b32 s71, v39, 21 +; WAVE32-NEXT: v_readlane_b32 s70, v39, 20 +; WAVE32-NEXT: v_readlane_b32 s69, v39, 19 +; WAVE32-NEXT: v_readlane_b32 s68, v39, 18 +; WAVE32-NEXT: v_readlane_b32 s67, v39, 17 +; WAVE32-NEXT: v_readlane_b32 s66, v39, 16 +; WAVE32-NEXT: v_readlane_b32 s65, v39, 15 +; WAVE32-NEXT: v_readlane_b32 s64, v39, 14 +; WAVE32-NEXT: v_readlane_b32 s55, v39, 13 +; WAVE32-NEXT: v_readlane_b32 s54, v39, 12 +; WAVE32-NEXT: v_readlane_b32 s53, v39, 11 +; WAVE32-NEXT: v_readlane_b32 s52, v39, 10 +; WAVE32-NEXT: v_readlane_b32 s51, v39, 9 +; WAVE32-NEXT: v_readlane_b32 s50, v39, 8 +; WAVE32-NEXT: v_readlane_b32 s49, v39, 7 +; WAVE32-NEXT: v_readlane_b32 s48, v39, 6 +; WAVE32-NEXT: v_readlane_b32 s39, v39, 5 +; WAVE32-NEXT: v_readlane_b32 s38, v39, 4 +; WAVE32-NEXT: v_readlane_b32 s37, v39, 3 +; WAVE32-NEXT: v_readlane_b32 s36, v39, 2 +; WAVE32-NEXT: v_readlane_b32 s35, v39, 1 +; WAVE32-NEXT: v_readlane_b32 s34, v39, 0 +; WAVE32-NEXT: s_mov_b32 s32, s33 +; WAVE32-NEXT: s_xor_saveexec_b32 s4, -1 +; WAVE32-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:192 ; 4-byte Folded Reload +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s4 +; WAVE32-NEXT: .cfi_def_cfa_register 64 +; WAVE32-NEXT: s_mov_b32 s33, s40 +; WAVE32-NEXT: s_waitcnt vmcnt(0) +; WAVE32-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber nonpreserved and 32 CSR SGPRs", "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29} ,~{s34},~{s35},~{s36},~{s37},~{s38},~{s39} - 
,~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49} - ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59} - ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65} + ,~{s48},~{s49},~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s64},~{s65} + ,~{s66},~{s67},~{s68},~{s69},~{s70},~{s71},~{s80},~{s81},~{s82},~{s83} + ,~{s84},~{s85},~{s86},~{s87},~{s96},~{s97} ,~{vcc}"() call void asm sideeffect "; clobber all VGPRs except v39", @@ -138,6 +1125,326 @@ define void @callee_need_to_spill_fp_exec_to_memory() #2 { } define internal void @caller_needs_to_spill_pc_to_memory() #3 { +; WAVE64-LABEL: caller_needs_to_spill_pc_to_memory: +; WAVE64: .Lfunc_begin6: +; WAVE64-NEXT: .cfi_startproc +; WAVE64-NEXT: ; %bb.0: +; WAVE64-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE64-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE64-NEXT: .cfi_undefined 2560 +; WAVE64-NEXT: .cfi_undefined 2561 +; WAVE64-NEXT: .cfi_undefined 2562 +; WAVE64-NEXT: .cfi_undefined 2563 +; WAVE64-NEXT: .cfi_undefined 2564 +; WAVE64-NEXT: .cfi_undefined 2565 +; WAVE64-NEXT: .cfi_undefined 2566 +; WAVE64-NEXT: .cfi_undefined 2567 +; WAVE64-NEXT: .cfi_undefined 2568 +; WAVE64-NEXT: .cfi_undefined 2569 +; WAVE64-NEXT: .cfi_undefined 2570 +; WAVE64-NEXT: .cfi_undefined 2571 +; WAVE64-NEXT: .cfi_undefined 2572 +; WAVE64-NEXT: .cfi_undefined 2573 +; WAVE64-NEXT: .cfi_undefined 2574 +; WAVE64-NEXT: .cfi_undefined 2575 +; WAVE64-NEXT: .cfi_undefined 2576 +; WAVE64-NEXT: .cfi_undefined 2577 +; WAVE64-NEXT: .cfi_undefined 2578 +; WAVE64-NEXT: .cfi_undefined 2579 +; WAVE64-NEXT: .cfi_undefined 2580 +; WAVE64-NEXT: .cfi_undefined 2581 +; WAVE64-NEXT: .cfi_undefined 2582 +; WAVE64-NEXT: .cfi_undefined 2583 +; WAVE64-NEXT: .cfi_undefined 2584 +; WAVE64-NEXT: .cfi_undefined 2585 +; WAVE64-NEXT: .cfi_undefined 2586 +; WAVE64-NEXT: .cfi_undefined 2587 +; WAVE64-NEXT: .cfi_undefined 2588 +; WAVE64-NEXT: .cfi_undefined 2589 +; WAVE64-NEXT: .cfi_undefined 2590 +; WAVE64-NEXT: .cfi_undefined 2591 +; 
WAVE64-NEXT: .cfi_undefined 2592 +; WAVE64-NEXT: .cfi_undefined 2593 +; WAVE64-NEXT: .cfi_undefined 2594 +; WAVE64-NEXT: .cfi_undefined 2595 +; WAVE64-NEXT: .cfi_undefined 2596 +; WAVE64-NEXT: .cfi_undefined 2597 +; WAVE64-NEXT: .cfi_undefined 2598 +; WAVE64-NEXT: .cfi_undefined 2599 +; WAVE64-NEXT: .cfi_undefined 2608 +; WAVE64-NEXT: .cfi_undefined 2609 +; WAVE64-NEXT: .cfi_undefined 2610 +; WAVE64-NEXT: .cfi_undefined 2611 +; WAVE64-NEXT: .cfi_undefined 2612 +; WAVE64-NEXT: .cfi_undefined 2613 +; WAVE64-NEXT: .cfi_undefined 2614 +; WAVE64-NEXT: .cfi_undefined 2615 +; WAVE64-NEXT: .cfi_undefined 2624 +; WAVE64-NEXT: .cfi_undefined 2625 +; WAVE64-NEXT: .cfi_undefined 2626 +; WAVE64-NEXT: .cfi_undefined 2627 +; WAVE64-NEXT: .cfi_undefined 2628 +; WAVE64-NEXT: .cfi_undefined 2629 +; WAVE64-NEXT: .cfi_undefined 2630 +; WAVE64-NEXT: .cfi_undefined 2631 +; WAVE64-NEXT: .cfi_undefined 2640 +; WAVE64-NEXT: .cfi_undefined 2641 +; WAVE64-NEXT: .cfi_undefined 2642 +; WAVE64-NEXT: .cfi_undefined 2643 +; WAVE64-NEXT: .cfi_undefined 2644 +; WAVE64-NEXT: .cfi_undefined 2645 +; WAVE64-NEXT: .cfi_undefined 2646 +; WAVE64-NEXT: .cfi_undefined 2647 +; WAVE64-NEXT: .cfi_undefined 2656 +; WAVE64-NEXT: .cfi_undefined 2657 +; WAVE64-NEXT: .cfi_undefined 2658 +; WAVE64-NEXT: .cfi_undefined 2659 +; WAVE64-NEXT: .cfi_undefined 2660 +; WAVE64-NEXT: .cfi_undefined 2661 +; WAVE64-NEXT: .cfi_undefined 2662 +; WAVE64-NEXT: .cfi_undefined 2663 +; WAVE64-NEXT: .cfi_undefined 2672 +; WAVE64-NEXT: .cfi_undefined 2673 +; WAVE64-NEXT: .cfi_undefined 2674 +; WAVE64-NEXT: .cfi_undefined 2675 +; WAVE64-NEXT: .cfi_undefined 2676 +; WAVE64-NEXT: .cfi_undefined 2677 +; WAVE64-NEXT: .cfi_undefined 2678 +; WAVE64-NEXT: .cfi_undefined 2679 +; WAVE64-NEXT: .cfi_undefined 2688 +; WAVE64-NEXT: .cfi_undefined 2689 +; WAVE64-NEXT: .cfi_undefined 2690 +; WAVE64-NEXT: .cfi_undefined 2691 +; WAVE64-NEXT: .cfi_undefined 2692 +; WAVE64-NEXT: .cfi_undefined 2693 +; WAVE64-NEXT: .cfi_undefined 2694 +; WAVE64-NEXT: 
.cfi_undefined 2695 +; WAVE64-NEXT: .cfi_undefined 2704 +; WAVE64-NEXT: .cfi_undefined 2705 +; WAVE64-NEXT: .cfi_undefined 2706 +; WAVE64-NEXT: .cfi_undefined 2707 +; WAVE64-NEXT: .cfi_undefined 2708 +; WAVE64-NEXT: .cfi_undefined 2709 +; WAVE64-NEXT: .cfi_undefined 2710 +; WAVE64-NEXT: .cfi_undefined 2711 +; WAVE64-NEXT: .cfi_undefined 2720 +; WAVE64-NEXT: .cfi_undefined 2721 +; WAVE64-NEXT: .cfi_undefined 2722 +; WAVE64-NEXT: .cfi_undefined 2723 +; WAVE64-NEXT: .cfi_undefined 2724 +; WAVE64-NEXT: .cfi_undefined 2725 +; WAVE64-NEXT: .cfi_undefined 2726 +; WAVE64-NEXT: .cfi_undefined 2727 +; WAVE64-NEXT: .cfi_undefined 2736 +; WAVE64-NEXT: .cfi_undefined 2737 +; WAVE64-NEXT: .cfi_undefined 2738 +; WAVE64-NEXT: .cfi_undefined 2739 +; WAVE64-NEXT: .cfi_undefined 2740 +; WAVE64-NEXT: .cfi_undefined 2741 +; WAVE64-NEXT: .cfi_undefined 2742 +; WAVE64-NEXT: .cfi_undefined 2743 +; WAVE64-NEXT: .cfi_undefined 2752 +; WAVE64-NEXT: .cfi_undefined 2753 +; WAVE64-NEXT: .cfi_undefined 2754 +; WAVE64-NEXT: .cfi_undefined 2755 +; WAVE64-NEXT: .cfi_undefined 2756 +; WAVE64-NEXT: .cfi_undefined 2757 +; WAVE64-NEXT: .cfi_undefined 2758 +; WAVE64-NEXT: .cfi_undefined 2759 +; WAVE64-NEXT: .cfi_undefined 2768 +; WAVE64-NEXT: .cfi_undefined 2769 +; WAVE64-NEXT: .cfi_undefined 2770 +; WAVE64-NEXT: .cfi_undefined 2771 +; WAVE64-NEXT: .cfi_undefined 2772 +; WAVE64-NEXT: .cfi_undefined 2773 +; WAVE64-NEXT: .cfi_undefined 2774 +; WAVE64-NEXT: .cfi_undefined 2775 +; WAVE64-NEXT: .cfi_undefined 2784 +; WAVE64-NEXT: .cfi_undefined 2785 +; WAVE64-NEXT: .cfi_undefined 2786 +; WAVE64-NEXT: .cfi_undefined 2787 +; WAVE64-NEXT: .cfi_undefined 2788 +; WAVE64-NEXT: .cfi_undefined 2789 +; WAVE64-NEXT: .cfi_undefined 2790 +; WAVE64-NEXT: .cfi_undefined 2791 +; WAVE64-NEXT: .cfi_undefined 2800 +; WAVE64-NEXT: .cfi_undefined 2801 +; WAVE64-NEXT: .cfi_undefined 2802 +; WAVE64-NEXT: .cfi_undefined 2803 +; WAVE64-NEXT: .cfi_undefined 2804 +; WAVE64-NEXT: .cfi_undefined 2805 +; WAVE64-NEXT: .cfi_undefined 2806 
+; WAVE64-NEXT: .cfi_undefined 2807 +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: v_mov_b32_e32 v0, exec_lo +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE64-NEXT: v_mov_b32_e32 v0, exec_hi +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 17, 0 +; WAVE64-NEXT: ;;#ASMSTART +; WAVE64-NEXT: ; clobber all VGPRs +; WAVE64-NEXT: ;;#ASMEND +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: caller_needs_to_spill_pc_to_memory: +; WAVE32: .Lfunc_begin6: +; WAVE32-NEXT: .cfi_startproc +; WAVE32-NEXT: ; %bb.0: +; WAVE32-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE32-NEXT: .cfi_undefined 1536 +; WAVE32-NEXT: .cfi_undefined 1537 +; WAVE32-NEXT: .cfi_undefined 1538 +; WAVE32-NEXT: .cfi_undefined 1539 +; WAVE32-NEXT: .cfi_undefined 1540 +; WAVE32-NEXT: .cfi_undefined 1541 +; WAVE32-NEXT: .cfi_undefined 1542 +; WAVE32-NEXT: .cfi_undefined 1543 +; WAVE32-NEXT: .cfi_undefined 1544 +; WAVE32-NEXT: .cfi_undefined 1545 +; WAVE32-NEXT: .cfi_undefined 1546 +; WAVE32-NEXT: .cfi_undefined 1547 +; WAVE32-NEXT: .cfi_undefined 1548 +; WAVE32-NEXT: .cfi_undefined 1549 +; WAVE32-NEXT: .cfi_undefined 1550 +; WAVE32-NEXT: .cfi_undefined 1551 +; WAVE32-NEXT: .cfi_undefined 1552 +; WAVE32-NEXT: .cfi_undefined 1553 +; WAVE32-NEXT: .cfi_undefined 1554 +; WAVE32-NEXT: .cfi_undefined 1555 +; WAVE32-NEXT: .cfi_undefined 1556 +; WAVE32-NEXT: .cfi_undefined 1557 +; WAVE32-NEXT: .cfi_undefined 1558 +; WAVE32-NEXT: .cfi_undefined 1559 +; WAVE32-NEXT: .cfi_undefined 1560 +; WAVE32-NEXT: .cfi_undefined 1561 +; WAVE32-NEXT: .cfi_undefined 1562 +; WAVE32-NEXT: .cfi_undefined 1563 +; WAVE32-NEXT: .cfi_undefined 1564 +; WAVE32-NEXT: .cfi_undefined 1565 +; WAVE32-NEXT: .cfi_undefined 1566 +; WAVE32-NEXT: .cfi_undefined 1567 +; WAVE32-NEXT: .cfi_undefined 1568 +; WAVE32-NEXT: 
.cfi_undefined 1569 +; WAVE32-NEXT: .cfi_undefined 1570 +; WAVE32-NEXT: .cfi_undefined 1571 +; WAVE32-NEXT: .cfi_undefined 1572 +; WAVE32-NEXT: .cfi_undefined 1573 +; WAVE32-NEXT: .cfi_undefined 1574 +; WAVE32-NEXT: .cfi_undefined 1575 +; WAVE32-NEXT: .cfi_undefined 1584 +; WAVE32-NEXT: .cfi_undefined 1585 +; WAVE32-NEXT: .cfi_undefined 1586 +; WAVE32-NEXT: .cfi_undefined 1587 +; WAVE32-NEXT: .cfi_undefined 1588 +; WAVE32-NEXT: .cfi_undefined 1589 +; WAVE32-NEXT: .cfi_undefined 1590 +; WAVE32-NEXT: .cfi_undefined 1591 +; WAVE32-NEXT: .cfi_undefined 1600 +; WAVE32-NEXT: .cfi_undefined 1601 +; WAVE32-NEXT: .cfi_undefined 1602 +; WAVE32-NEXT: .cfi_undefined 1603 +; WAVE32-NEXT: .cfi_undefined 1604 +; WAVE32-NEXT: .cfi_undefined 1605 +; WAVE32-NEXT: .cfi_undefined 1606 +; WAVE32-NEXT: .cfi_undefined 1607 +; WAVE32-NEXT: .cfi_undefined 1616 +; WAVE32-NEXT: .cfi_undefined 1617 +; WAVE32-NEXT: .cfi_undefined 1618 +; WAVE32-NEXT: .cfi_undefined 1619 +; WAVE32-NEXT: .cfi_undefined 1620 +; WAVE32-NEXT: .cfi_undefined 1621 +; WAVE32-NEXT: .cfi_undefined 1622 +; WAVE32-NEXT: .cfi_undefined 1623 +; WAVE32-NEXT: .cfi_undefined 1632 +; WAVE32-NEXT: .cfi_undefined 1633 +; WAVE32-NEXT: .cfi_undefined 1634 +; WAVE32-NEXT: .cfi_undefined 1635 +; WAVE32-NEXT: .cfi_undefined 1636 +; WAVE32-NEXT: .cfi_undefined 1637 +; WAVE32-NEXT: .cfi_undefined 1638 +; WAVE32-NEXT: .cfi_undefined 1639 +; WAVE32-NEXT: .cfi_undefined 1648 +; WAVE32-NEXT: .cfi_undefined 1649 +; WAVE32-NEXT: .cfi_undefined 1650 +; WAVE32-NEXT: .cfi_undefined 1651 +; WAVE32-NEXT: .cfi_undefined 1652 +; WAVE32-NEXT: .cfi_undefined 1653 +; WAVE32-NEXT: .cfi_undefined 1654 +; WAVE32-NEXT: .cfi_undefined 1655 +; WAVE32-NEXT: .cfi_undefined 1664 +; WAVE32-NEXT: .cfi_undefined 1665 +; WAVE32-NEXT: .cfi_undefined 1666 +; WAVE32-NEXT: .cfi_undefined 1667 +; WAVE32-NEXT: .cfi_undefined 1668 +; WAVE32-NEXT: .cfi_undefined 1669 +; WAVE32-NEXT: .cfi_undefined 1670 +; WAVE32-NEXT: .cfi_undefined 1671 +; WAVE32-NEXT: .cfi_undefined 1680 
+; WAVE32-NEXT: .cfi_undefined 1681 +; WAVE32-NEXT: .cfi_undefined 1682 +; WAVE32-NEXT: .cfi_undefined 1683 +; WAVE32-NEXT: .cfi_undefined 1684 +; WAVE32-NEXT: .cfi_undefined 1685 +; WAVE32-NEXT: .cfi_undefined 1686 +; WAVE32-NEXT: .cfi_undefined 1687 +; WAVE32-NEXT: .cfi_undefined 1696 +; WAVE32-NEXT: .cfi_undefined 1697 +; WAVE32-NEXT: .cfi_undefined 1698 +; WAVE32-NEXT: .cfi_undefined 1699 +; WAVE32-NEXT: .cfi_undefined 1700 +; WAVE32-NEXT: .cfi_undefined 1701 +; WAVE32-NEXT: .cfi_undefined 1702 +; WAVE32-NEXT: .cfi_undefined 1703 +; WAVE32-NEXT: .cfi_undefined 1712 +; WAVE32-NEXT: .cfi_undefined 1713 +; WAVE32-NEXT: .cfi_undefined 1714 +; WAVE32-NEXT: .cfi_undefined 1715 +; WAVE32-NEXT: .cfi_undefined 1716 +; WAVE32-NEXT: .cfi_undefined 1717 +; WAVE32-NEXT: .cfi_undefined 1718 +; WAVE32-NEXT: .cfi_undefined 1719 +; WAVE32-NEXT: .cfi_undefined 1728 +; WAVE32-NEXT: .cfi_undefined 1729 +; WAVE32-NEXT: .cfi_undefined 1730 +; WAVE32-NEXT: .cfi_undefined 1731 +; WAVE32-NEXT: .cfi_undefined 1732 +; WAVE32-NEXT: .cfi_undefined 1733 +; WAVE32-NEXT: .cfi_undefined 1734 +; WAVE32-NEXT: .cfi_undefined 1735 +; WAVE32-NEXT: .cfi_undefined 1744 +; WAVE32-NEXT: .cfi_undefined 1745 +; WAVE32-NEXT: .cfi_undefined 1746 +; WAVE32-NEXT: .cfi_undefined 1747 +; WAVE32-NEXT: .cfi_undefined 1748 +; WAVE32-NEXT: .cfi_undefined 1749 +; WAVE32-NEXT: .cfi_undefined 1750 +; WAVE32-NEXT: .cfi_undefined 1751 +; WAVE32-NEXT: .cfi_undefined 1760 +; WAVE32-NEXT: .cfi_undefined 1761 +; WAVE32-NEXT: .cfi_undefined 1762 +; WAVE32-NEXT: .cfi_undefined 1763 +; WAVE32-NEXT: .cfi_undefined 1764 +; WAVE32-NEXT: .cfi_undefined 1765 +; WAVE32-NEXT: .cfi_undefined 1766 +; WAVE32-NEXT: .cfi_undefined 1767 +; WAVE32-NEXT: .cfi_undefined 1776 +; WAVE32-NEXT: .cfi_undefined 1777 +; WAVE32-NEXT: .cfi_undefined 1778 +; WAVE32-NEXT: .cfi_undefined 1779 +; WAVE32-NEXT: .cfi_undefined 1780 +; WAVE32-NEXT: .cfi_undefined 1781 +; WAVE32-NEXT: .cfi_undefined 1782 +; WAVE32-NEXT: .cfi_undefined 1783 +; WAVE32-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: v_mov_b32_e32 v0, exec_lo +; WAVE32-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_offset 1, 0 +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber all VGPRs +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber all VGPRs", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} @@ -168,25 +1475,1068 @@ define internal void @caller_needs_to_spill_pc_to_memory() #3 { ret void } +define void @need_to_spill_pc_to_mem() #3 { ; WAVE64-LABEL: need_to_spill_pc_to_mem: -; WAVE64: s_mov_b64 exec, 3 -; WAVE64-NEXT: buffer_store_dword [[TEMP_VGPR:v[0-9]+]] -; WAVE64-NEXT: v_writelane_b32 [[TEMP_VGPR]], s30, 0 -; WAVE64-NEXT: v_writelane_b32 [[TEMP_VGPR]], s31, 1 -; WAVE64-NEXT: buffer_store_dword [[TEMP_VGPR]], off, s[0:3], s33 offset: -; WAVE64-NEXT: .cfi_offset 16, -; WAVE64-NEXT: buffer_load_dword [[TEMP_VGPR]] - +; WAVE64: .Lfunc_begin7: +; WAVE64-NEXT: .cfi_startproc +; WAVE64-NEXT: ; %bb.0: +; WAVE64-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE64-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE64-NEXT: .cfi_undefined 2560 +; WAVE64-NEXT: .cfi_undefined 2561 +; WAVE64-NEXT: .cfi_undefined 2562 +; WAVE64-NEXT: .cfi_undefined 2563 +; WAVE64-NEXT: .cfi_undefined 2564 +; WAVE64-NEXT: .cfi_undefined 2565 +; WAVE64-NEXT: .cfi_undefined 2566 +; WAVE64-NEXT: .cfi_undefined 2567 +; WAVE64-NEXT: .cfi_undefined 2568 +; WAVE64-NEXT: .cfi_undefined 2569 +; WAVE64-NEXT: .cfi_undefined 2570 +; WAVE64-NEXT: .cfi_undefined 2571 +; WAVE64-NEXT: .cfi_undefined 2572 +; WAVE64-NEXT: .cfi_undefined 2573 +; WAVE64-NEXT: .cfi_undefined 2574 +; WAVE64-NEXT: .cfi_undefined 2575 +; WAVE64-NEXT: .cfi_undefined 2576 +; WAVE64-NEXT: .cfi_undefined 2577 +; WAVE64-NEXT: .cfi_undefined 2578 +; WAVE64-NEXT: .cfi_undefined 2579 +; WAVE64-NEXT: .cfi_undefined 2580 +; WAVE64-NEXT: 
.cfi_undefined 2581 +; WAVE64-NEXT: .cfi_undefined 2582 +; WAVE64-NEXT: .cfi_undefined 2583 +; WAVE64-NEXT: .cfi_undefined 2584 +; WAVE64-NEXT: .cfi_undefined 2585 +; WAVE64-NEXT: .cfi_undefined 2586 +; WAVE64-NEXT: .cfi_undefined 2587 +; WAVE64-NEXT: .cfi_undefined 2588 +; WAVE64-NEXT: .cfi_undefined 2589 +; WAVE64-NEXT: .cfi_undefined 2590 +; WAVE64-NEXT: .cfi_undefined 2591 +; WAVE64-NEXT: .cfi_undefined 2592 +; WAVE64-NEXT: .cfi_undefined 2593 +; WAVE64-NEXT: .cfi_undefined 2594 +; WAVE64-NEXT: .cfi_undefined 2595 +; WAVE64-NEXT: .cfi_undefined 2596 +; WAVE64-NEXT: .cfi_undefined 2597 +; WAVE64-NEXT: .cfi_undefined 2598 +; WAVE64-NEXT: .cfi_undefined 2599 +; WAVE64-NEXT: .cfi_undefined 2608 +; WAVE64-NEXT: .cfi_undefined 2609 +; WAVE64-NEXT: .cfi_undefined 2610 +; WAVE64-NEXT: .cfi_undefined 2611 +; WAVE64-NEXT: .cfi_undefined 2612 +; WAVE64-NEXT: .cfi_undefined 2613 +; WAVE64-NEXT: .cfi_undefined 2614 +; WAVE64-NEXT: .cfi_undefined 2615 +; WAVE64-NEXT: .cfi_undefined 2624 +; WAVE64-NEXT: .cfi_undefined 2625 +; WAVE64-NEXT: .cfi_undefined 2626 +; WAVE64-NEXT: .cfi_undefined 2627 +; WAVE64-NEXT: .cfi_undefined 2628 +; WAVE64-NEXT: .cfi_undefined 2629 +; WAVE64-NEXT: .cfi_undefined 2630 +; WAVE64-NEXT: .cfi_undefined 2631 +; WAVE64-NEXT: .cfi_undefined 2640 +; WAVE64-NEXT: .cfi_undefined 2641 +; WAVE64-NEXT: .cfi_undefined 2642 +; WAVE64-NEXT: .cfi_undefined 2643 +; WAVE64-NEXT: .cfi_undefined 2644 +; WAVE64-NEXT: .cfi_undefined 2645 +; WAVE64-NEXT: .cfi_undefined 2646 +; WAVE64-NEXT: .cfi_undefined 2647 +; WAVE64-NEXT: .cfi_undefined 2656 +; WAVE64-NEXT: .cfi_undefined 2657 +; WAVE64-NEXT: .cfi_undefined 2658 +; WAVE64-NEXT: .cfi_undefined 2659 +; WAVE64-NEXT: .cfi_undefined 2660 +; WAVE64-NEXT: .cfi_undefined 2661 +; WAVE64-NEXT: .cfi_undefined 2662 +; WAVE64-NEXT: .cfi_undefined 2663 +; WAVE64-NEXT: .cfi_undefined 2672 +; WAVE64-NEXT: .cfi_undefined 2673 +; WAVE64-NEXT: .cfi_undefined 2674 +; WAVE64-NEXT: .cfi_undefined 2675 +; WAVE64-NEXT: .cfi_undefined 2676 
+; WAVE64-NEXT: .cfi_undefined 2677 +; WAVE64-NEXT: .cfi_undefined 2678 +; WAVE64-NEXT: .cfi_undefined 2679 +; WAVE64-NEXT: .cfi_undefined 2688 +; WAVE64-NEXT: .cfi_undefined 2689 +; WAVE64-NEXT: .cfi_undefined 2690 +; WAVE64-NEXT: .cfi_undefined 2691 +; WAVE64-NEXT: .cfi_undefined 2692 +; WAVE64-NEXT: .cfi_undefined 2693 +; WAVE64-NEXT: .cfi_undefined 2694 +; WAVE64-NEXT: .cfi_undefined 2695 +; WAVE64-NEXT: .cfi_undefined 2704 +; WAVE64-NEXT: .cfi_undefined 2705 +; WAVE64-NEXT: .cfi_undefined 2706 +; WAVE64-NEXT: .cfi_undefined 2707 +; WAVE64-NEXT: .cfi_undefined 2708 +; WAVE64-NEXT: .cfi_undefined 2709 +; WAVE64-NEXT: .cfi_undefined 2710 +; WAVE64-NEXT: .cfi_undefined 2711 +; WAVE64-NEXT: .cfi_undefined 2720 +; WAVE64-NEXT: .cfi_undefined 2721 +; WAVE64-NEXT: .cfi_undefined 2722 +; WAVE64-NEXT: .cfi_undefined 2723 +; WAVE64-NEXT: .cfi_undefined 2724 +; WAVE64-NEXT: .cfi_undefined 2725 +; WAVE64-NEXT: .cfi_undefined 2726 +; WAVE64-NEXT: .cfi_undefined 2727 +; WAVE64-NEXT: .cfi_undefined 2736 +; WAVE64-NEXT: .cfi_undefined 2737 +; WAVE64-NEXT: .cfi_undefined 2738 +; WAVE64-NEXT: .cfi_undefined 2739 +; WAVE64-NEXT: .cfi_undefined 2740 +; WAVE64-NEXT: .cfi_undefined 2741 +; WAVE64-NEXT: .cfi_undefined 2742 +; WAVE64-NEXT: .cfi_undefined 2743 +; WAVE64-NEXT: .cfi_undefined 2752 +; WAVE64-NEXT: .cfi_undefined 2753 +; WAVE64-NEXT: .cfi_undefined 2754 +; WAVE64-NEXT: .cfi_undefined 2755 +; WAVE64-NEXT: .cfi_undefined 2756 +; WAVE64-NEXT: .cfi_undefined 2757 +; WAVE64-NEXT: .cfi_undefined 2758 +; WAVE64-NEXT: .cfi_undefined 2759 +; WAVE64-NEXT: .cfi_undefined 2768 +; WAVE64-NEXT: .cfi_undefined 2769 +; WAVE64-NEXT: .cfi_undefined 2770 +; WAVE64-NEXT: .cfi_undefined 2771 +; WAVE64-NEXT: .cfi_undefined 2772 +; WAVE64-NEXT: .cfi_undefined 2773 +; WAVE64-NEXT: .cfi_undefined 2774 +; WAVE64-NEXT: .cfi_undefined 2775 +; WAVE64-NEXT: .cfi_undefined 2784 +; WAVE64-NEXT: .cfi_undefined 2785 +; WAVE64-NEXT: .cfi_undefined 2786 +; WAVE64-NEXT: .cfi_undefined 2787 +; WAVE64-NEXT: 
.cfi_undefined 2788 +; WAVE64-NEXT: .cfi_undefined 2789 +; WAVE64-NEXT: .cfi_undefined 2790 +; WAVE64-NEXT: .cfi_undefined 2791 +; WAVE64-NEXT: .cfi_undefined 2800 +; WAVE64-NEXT: .cfi_undefined 2801 +; WAVE64-NEXT: .cfi_undefined 2802 +; WAVE64-NEXT: .cfi_undefined 2803 +; WAVE64-NEXT: .cfi_undefined 2804 +; WAVE64-NEXT: .cfi_undefined 2805 +; WAVE64-NEXT: .cfi_undefined 2806 +; WAVE64-NEXT: .cfi_undefined 2807 +; WAVE64-NEXT: .cfi_undefined 48 +; WAVE64-NEXT: .cfi_undefined 49 +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: s_mov_b32 s18, s33 +; WAVE64-NEXT: .cfi_register 65, 50 +; WAVE64-NEXT: s_mov_b32 s33, s32 +; WAVE64-NEXT: v_mov_b32_e32 v0, exec_lo +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456 ; 4-byte Folded Spill +; WAVE64-NEXT: v_mov_b32_e32 v0, exec_hi +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:460 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 17, 29184 +; WAVE64-NEXT: .cfi_def_cfa_register 65 +; WAVE64-NEXT: s_addk_i32 s32, 0x7800 +; WAVE64-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 28416 +; WAVE64-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 28160 +; WAVE64-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2602, 32, 17, 64, 27904 +; WAVE64-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2603, 32, 17, 64, 27648 +; WAVE64-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2604, 32, 17, 64, 27392 +; WAVE64-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2605, 32, 17, 64, 27136 +; WAVE64-NEXT: buffer_store_dword v46, off, 
s[0:3], s33 offset:420 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2606, 32, 17, 64, 26880 +; WAVE64-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2607, 32, 17, 64, 26624 +; WAVE64-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2616, 32, 17, 64, 26368 +; WAVE64-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2617, 32, 17, 64, 26112 +; WAVE64-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2618, 32, 17, 64, 25856 +; WAVE64-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2619, 32, 17, 64, 25600 +; WAVE64-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2620, 32, 17, 64, 25344 +; WAVE64-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2621, 32, 17, 64, 25088 +; WAVE64-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2622, 32, 17, 64, 24832 +; WAVE64-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2623, 32, 17, 64, 24576 +; WAVE64-NEXT: buffer_store_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2632, 32, 17, 64, 24320 +; WAVE64-NEXT: buffer_store_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2633, 32, 17, 64, 24064 +; WAVE64-NEXT: buffer_store_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2634, 32, 17, 64, 23808 +; WAVE64-NEXT: buffer_store_dword v75, 
off, s[0:3], s33 offset:368 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2635, 32, 17, 64, 23552 +; WAVE64-NEXT: buffer_store_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2636, 32, 17, 64, 23296 +; WAVE64-NEXT: buffer_store_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2637, 32, 17, 64, 23040 +; WAVE64-NEXT: buffer_store_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2638, 32, 17, 64, 22784 +; WAVE64-NEXT: buffer_store_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2639, 32, 17, 64, 22528 +; WAVE64-NEXT: buffer_store_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2648, 32, 17, 64, 22272 +; WAVE64-NEXT: buffer_store_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2649, 32, 17, 64, 22016 +; WAVE64-NEXT: buffer_store_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2650, 32, 17, 64, 21760 +; WAVE64-NEXT: buffer_store_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2651, 32, 17, 64, 21504 +; WAVE64-NEXT: buffer_store_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2652, 32, 17, 64, 21248 +; WAVE64-NEXT: buffer_store_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2653, 32, 17, 64, 20992 +; WAVE64-NEXT: buffer_store_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2654, 32, 17, 64, 20736 +; WAVE64-NEXT: buffer_store_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2655, 32, 17, 64, 20480 +; WAVE64-NEXT: buffer_store_dword 
v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2664, 32, 17, 64, 20224 +; WAVE64-NEXT: buffer_store_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2665, 32, 17, 64, 19968 +; WAVE64-NEXT: buffer_store_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2666, 32, 17, 64, 19712 +; WAVE64-NEXT: buffer_store_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2667, 32, 17, 64, 19456 +; WAVE64-NEXT: buffer_store_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2668, 32, 17, 64, 19200 +; WAVE64-NEXT: buffer_store_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2669, 32, 17, 64, 18944 +; WAVE64-NEXT: buffer_store_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2670, 32, 17, 64, 18688 +; WAVE64-NEXT: buffer_store_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2671, 32, 17, 64, 18432 +; WAVE64-NEXT: buffer_store_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2680, 32, 17, 64, 18176 +; WAVE64-NEXT: buffer_store_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2681, 32, 17, 64, 17920 +; WAVE64-NEXT: buffer_store_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2682, 32, 17, 64, 17664 +; WAVE64-NEXT: buffer_store_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2683, 32, 17, 64, 17408 +; WAVE64-NEXT: buffer_store_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2684, 32, 17, 64, 17152 +; WAVE64-NEXT: 
buffer_store_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2685, 32, 17, 64, 16896 +; WAVE64-NEXT: buffer_store_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2686, 32, 17, 64, 16640 +; WAVE64-NEXT: buffer_store_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2687, 32, 17, 64, 16384 +; WAVE64-NEXT: buffer_store_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2696, 32, 17, 64, 16128 +; WAVE64-NEXT: buffer_store_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2697, 32, 17, 64, 15872 +; WAVE64-NEXT: buffer_store_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2698, 32, 17, 64, 15616 +; WAVE64-NEXT: buffer_store_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2699, 32, 17, 64, 15360 +; WAVE64-NEXT: buffer_store_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2700, 32, 17, 64, 15104 +; WAVE64-NEXT: buffer_store_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2701, 32, 17, 64, 14848 +; WAVE64-NEXT: buffer_store_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2702, 32, 17, 64, 14592 +; WAVE64-NEXT: buffer_store_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2703, 32, 17, 64, 14336 +; WAVE64-NEXT: buffer_store_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2712, 32, 17, 64, 14080 +; WAVE64-NEXT: buffer_store_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2713, 32, 17, 64, 13824 
+; WAVE64-NEXT: buffer_store_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2714, 32, 17, 64, 13568 +; WAVE64-NEXT: buffer_store_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2715, 32, 17, 64, 13312 +; WAVE64-NEXT: buffer_store_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2716, 32, 17, 64, 13056 +; WAVE64-NEXT: buffer_store_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2717, 32, 17, 64, 12800 +; WAVE64-NEXT: buffer_store_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2718, 32, 17, 64, 12544 +; WAVE64-NEXT: buffer_store_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2719, 32, 17, 64, 12288 +; WAVE64-NEXT: buffer_store_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2728, 32, 17, 64, 12032 +; WAVE64-NEXT: buffer_store_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2729, 32, 17, 64, 11776 +; WAVE64-NEXT: buffer_store_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2730, 32, 17, 64, 11520 +; WAVE64-NEXT: buffer_store_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2731, 32, 17, 64, 11264 +; WAVE64-NEXT: buffer_store_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2732, 32, 17, 64, 11008 +; WAVE64-NEXT: buffer_store_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2733, 32, 17, 64, 10752 +; WAVE64-NEXT: buffer_store_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2734, 
32, 17, 64, 10496 +; WAVE64-NEXT: buffer_store_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2735, 32, 17, 64, 10240 +; WAVE64-NEXT: buffer_store_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2744, 32, 17, 64, 9984 +; WAVE64-NEXT: buffer_store_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2745, 32, 17, 64, 9728 +; WAVE64-NEXT: buffer_store_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2746, 32, 17, 64, 9472 +; WAVE64-NEXT: buffer_store_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2747, 32, 17, 64, 9216 +; WAVE64-NEXT: buffer_store_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2748, 32, 17, 64, 8960 +; WAVE64-NEXT: buffer_store_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2749, 32, 17, 64, 8704 +; WAVE64-NEXT: buffer_store_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2750, 32, 17, 64, 8448 +; WAVE64-NEXT: buffer_store_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2751, 32, 17, 64, 8192 +; WAVE64-NEXT: buffer_store_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2760, 32, 17, 64, 7936 +; WAVE64-NEXT: buffer_store_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2761, 32, 17, 64, 7680 +; WAVE64-NEXT: buffer_store_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2762, 32, 17, 64, 7424 +; WAVE64-NEXT: buffer_store_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 
2763, 32, 17, 64, 7168 +; WAVE64-NEXT: buffer_store_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2764, 32, 17, 64, 6912 +; WAVE64-NEXT: buffer_store_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2765, 32, 17, 64, 6656 +; WAVE64-NEXT: buffer_store_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2766, 32, 17, 64, 6400 +; WAVE64-NEXT: buffer_store_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2767, 32, 17, 64, 6144 +; WAVE64-NEXT: buffer_store_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2776, 32, 17, 64, 5888 +; WAVE64-NEXT: buffer_store_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2777, 32, 17, 64, 5632 +; WAVE64-NEXT: buffer_store_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2778, 32, 17, 64, 5376 +; WAVE64-NEXT: buffer_store_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2779, 32, 17, 64, 5120 +; WAVE64-NEXT: buffer_store_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2780, 32, 17, 64, 4864 +; WAVE64-NEXT: buffer_store_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2781, 32, 17, 64, 4608 +; WAVE64-NEXT: buffer_store_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2782, 32, 17, 64, 4352 +; WAVE64-NEXT: buffer_store_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2783, 32, 17, 64, 4096 +; WAVE64-NEXT: buffer_store_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2792, 
32, 17, 64, 3840 +; WAVE64-NEXT: buffer_store_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2793, 32, 17, 64, 3584 +; WAVE64-NEXT: buffer_store_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2794, 32, 17, 64, 3328 +; WAVE64-NEXT: buffer_store_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2795, 32, 17, 64, 3072 +; WAVE64-NEXT: buffer_store_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2796, 32, 17, 64, 2816 +; WAVE64-NEXT: buffer_store_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2797, 32, 17, 64, 2560 +; WAVE64-NEXT: buffer_store_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2798, 32, 17, 64, 2304 +; WAVE64-NEXT: buffer_store_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2799, 32, 17, 64, 2048 +; WAVE64-NEXT: buffer_store_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2808, 32, 17, 64, 1792 +; WAVE64-NEXT: buffer_store_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2809, 32, 17, 64, 1536 +; WAVE64-NEXT: buffer_store_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2810, 32, 17, 64, 1280 +; WAVE64-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2811, 32, 17, 64, 1024 +; WAVE64-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2812, 32, 17, 64, 768 +; WAVE64-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2813, 32, 17, 64, 
512 +; WAVE64-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2814, 32, 17, 64, 256 +; WAVE64-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_llvm_vector_offset 2815, 32, 17, 64, 0 +; WAVE64-NEXT: s_mov_b64 s[16:17], exec +; WAVE64-NEXT: s_mov_b64 exec, 3 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:464 +; WAVE64-NEXT: v_writelane_b32 v0, s30, 0 +; WAVE64-NEXT: v_writelane_b32 v0, s31, 1 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 16, 28672 +; WAVE64-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:464 +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: s_mov_b64 exec, s[16:17] +; WAVE64-NEXT: s_getpc_b64 s[16:17] +; WAVE64-NEXT: s_add_u32 s16, s16, caller_needs_to_spill_pc_to_memory@rel32@lo+4 +; WAVE64-NEXT: s_addc_u32 s17, s17, caller_needs_to_spill_pc_to_memory@rel32@hi+12 +; WAVE64-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE64-NEXT: s_mov_b64 s[4:5], exec +; WAVE64-NEXT: s_mov_b64 exec, 3 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:464 +; WAVE64-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: v_readlane_b32 s30, v0, 0 +; WAVE64-NEXT: v_readlane_b32 s31, v0, 1 +; WAVE64-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:464 +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: s_mov_b64 exec, s[4:5] +; WAVE64-NEXT: buffer_load_dword v255, off, s[0:3], s33 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; WAVE64-NEXT: 
buffer_load_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword 
v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Reload +; WAVE64-NEXT: 
buffer_load_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Reload +; 
WAVE64-NEXT: buffer_load_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Reload +; 
WAVE64-NEXT: buffer_load_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload +; WAVE64-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload +; WAVE64-NEXT: s_mov_b32 s32, s33 +; WAVE64-NEXT: .cfi_def_cfa_register 64 +; WAVE64-NEXT: s_mov_b32 s33, s18 +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; ; WAVE32-LABEL: need_to_spill_pc_to_mem: -; WAVE32: s_mov_b32 exec_lo, 3 -; WAVE32-NEXT: 
buffer_store_dword [[TEMP_VGPR:v[0-9]+]] -; WAVE32-NEXT: v_writelane_b32 [[TEMP_VGPR]], s30, 0 -; WAVE32-NEXT: v_writelane_b32 [[TEMP_VGPR]], s31, 1 -; WAVE32-NEXT: buffer_store_dword [[TEMP_VGPR]], off, s[0:3], s33 offset: -; WAVE32-NEXT: .cfi_offset 16, -; WAVE32-NEXT: buffer_load_dword [[TEMP_VGPR]] - -define void @need_to_spill_pc_to_mem() #3 { +; WAVE32: .Lfunc_begin7: +; WAVE32-NEXT: .cfi_startproc +; WAVE32-NEXT: ; %bb.0: +; WAVE32-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE32-NEXT: .cfi_undefined 1536 +; WAVE32-NEXT: .cfi_undefined 1537 +; WAVE32-NEXT: .cfi_undefined 1538 +; WAVE32-NEXT: .cfi_undefined 1539 +; WAVE32-NEXT: .cfi_undefined 1540 +; WAVE32-NEXT: .cfi_undefined 1541 +; WAVE32-NEXT: .cfi_undefined 1542 +; WAVE32-NEXT: .cfi_undefined 1543 +; WAVE32-NEXT: .cfi_undefined 1544 +; WAVE32-NEXT: .cfi_undefined 1545 +; WAVE32-NEXT: .cfi_undefined 1546 +; WAVE32-NEXT: .cfi_undefined 1547 +; WAVE32-NEXT: .cfi_undefined 1548 +; WAVE32-NEXT: .cfi_undefined 1549 +; WAVE32-NEXT: .cfi_undefined 1550 +; WAVE32-NEXT: .cfi_undefined 1551 +; WAVE32-NEXT: .cfi_undefined 1552 +; WAVE32-NEXT: .cfi_undefined 1553 +; WAVE32-NEXT: .cfi_undefined 1554 +; WAVE32-NEXT: .cfi_undefined 1555 +; WAVE32-NEXT: .cfi_undefined 1556 +; WAVE32-NEXT: .cfi_undefined 1557 +; WAVE32-NEXT: .cfi_undefined 1558 +; WAVE32-NEXT: .cfi_undefined 1559 +; WAVE32-NEXT: .cfi_undefined 1560 +; WAVE32-NEXT: .cfi_undefined 1561 +; WAVE32-NEXT: .cfi_undefined 1562 +; WAVE32-NEXT: .cfi_undefined 1563 +; WAVE32-NEXT: .cfi_undefined 1564 +; WAVE32-NEXT: .cfi_undefined 1565 +; WAVE32-NEXT: .cfi_undefined 1566 +; WAVE32-NEXT: .cfi_undefined 1567 +; WAVE32-NEXT: .cfi_undefined 1568 +; WAVE32-NEXT: .cfi_undefined 1569 +; WAVE32-NEXT: .cfi_undefined 1570 +; WAVE32-NEXT: .cfi_undefined 1571 +; WAVE32-NEXT: .cfi_undefined 1572 +; WAVE32-NEXT: .cfi_undefined 1573 +; WAVE32-NEXT: .cfi_undefined 1574 +; WAVE32-NEXT: .cfi_undefined 1575 +; WAVE32-NEXT: 
.cfi_undefined 1584 +; WAVE32-NEXT: .cfi_undefined 1585 +; WAVE32-NEXT: .cfi_undefined 1586 +; WAVE32-NEXT: .cfi_undefined 1587 +; WAVE32-NEXT: .cfi_undefined 1588 +; WAVE32-NEXT: .cfi_undefined 1589 +; WAVE32-NEXT: .cfi_undefined 1590 +; WAVE32-NEXT: .cfi_undefined 1591 +; WAVE32-NEXT: .cfi_undefined 1600 +; WAVE32-NEXT: .cfi_undefined 1601 +; WAVE32-NEXT: .cfi_undefined 1602 +; WAVE32-NEXT: .cfi_undefined 1603 +; WAVE32-NEXT: .cfi_undefined 1604 +; WAVE32-NEXT: .cfi_undefined 1605 +; WAVE32-NEXT: .cfi_undefined 1606 +; WAVE32-NEXT: .cfi_undefined 1607 +; WAVE32-NEXT: .cfi_undefined 1616 +; WAVE32-NEXT: .cfi_undefined 1617 +; WAVE32-NEXT: .cfi_undefined 1618 +; WAVE32-NEXT: .cfi_undefined 1619 +; WAVE32-NEXT: .cfi_undefined 1620 +; WAVE32-NEXT: .cfi_undefined 1621 +; WAVE32-NEXT: .cfi_undefined 1622 +; WAVE32-NEXT: .cfi_undefined 1623 +; WAVE32-NEXT: .cfi_undefined 1632 +; WAVE32-NEXT: .cfi_undefined 1633 +; WAVE32-NEXT: .cfi_undefined 1634 +; WAVE32-NEXT: .cfi_undefined 1635 +; WAVE32-NEXT: .cfi_undefined 1636 +; WAVE32-NEXT: .cfi_undefined 1637 +; WAVE32-NEXT: .cfi_undefined 1638 +; WAVE32-NEXT: .cfi_undefined 1639 +; WAVE32-NEXT: .cfi_undefined 1648 +; WAVE32-NEXT: .cfi_undefined 1649 +; WAVE32-NEXT: .cfi_undefined 1650 +; WAVE32-NEXT: .cfi_undefined 1651 +; WAVE32-NEXT: .cfi_undefined 1652 +; WAVE32-NEXT: .cfi_undefined 1653 +; WAVE32-NEXT: .cfi_undefined 1654 +; WAVE32-NEXT: .cfi_undefined 1655 +; WAVE32-NEXT: .cfi_undefined 1664 +; WAVE32-NEXT: .cfi_undefined 1665 +; WAVE32-NEXT: .cfi_undefined 1666 +; WAVE32-NEXT: .cfi_undefined 1667 +; WAVE32-NEXT: .cfi_undefined 1668 +; WAVE32-NEXT: .cfi_undefined 1669 +; WAVE32-NEXT: .cfi_undefined 1670 +; WAVE32-NEXT: .cfi_undefined 1671 +; WAVE32-NEXT: .cfi_undefined 1680 +; WAVE32-NEXT: .cfi_undefined 1681 +; WAVE32-NEXT: .cfi_undefined 1682 +; WAVE32-NEXT: .cfi_undefined 1683 +; WAVE32-NEXT: .cfi_undefined 1684 +; WAVE32-NEXT: .cfi_undefined 1685 +; WAVE32-NEXT: .cfi_undefined 1686 +; WAVE32-NEXT: .cfi_undefined 1687 
+; WAVE32-NEXT: .cfi_undefined 1696 +; WAVE32-NEXT: .cfi_undefined 1697 +; WAVE32-NEXT: .cfi_undefined 1698 +; WAVE32-NEXT: .cfi_undefined 1699 +; WAVE32-NEXT: .cfi_undefined 1700 +; WAVE32-NEXT: .cfi_undefined 1701 +; WAVE32-NEXT: .cfi_undefined 1702 +; WAVE32-NEXT: .cfi_undefined 1703 +; WAVE32-NEXT: .cfi_undefined 1712 +; WAVE32-NEXT: .cfi_undefined 1713 +; WAVE32-NEXT: .cfi_undefined 1714 +; WAVE32-NEXT: .cfi_undefined 1715 +; WAVE32-NEXT: .cfi_undefined 1716 +; WAVE32-NEXT: .cfi_undefined 1717 +; WAVE32-NEXT: .cfi_undefined 1718 +; WAVE32-NEXT: .cfi_undefined 1719 +; WAVE32-NEXT: .cfi_undefined 1728 +; WAVE32-NEXT: .cfi_undefined 1729 +; WAVE32-NEXT: .cfi_undefined 1730 +; WAVE32-NEXT: .cfi_undefined 1731 +; WAVE32-NEXT: .cfi_undefined 1732 +; WAVE32-NEXT: .cfi_undefined 1733 +; WAVE32-NEXT: .cfi_undefined 1734 +; WAVE32-NEXT: .cfi_undefined 1735 +; WAVE32-NEXT: .cfi_undefined 1744 +; WAVE32-NEXT: .cfi_undefined 1745 +; WAVE32-NEXT: .cfi_undefined 1746 +; WAVE32-NEXT: .cfi_undefined 1747 +; WAVE32-NEXT: .cfi_undefined 1748 +; WAVE32-NEXT: .cfi_undefined 1749 +; WAVE32-NEXT: .cfi_undefined 1750 +; WAVE32-NEXT: .cfi_undefined 1751 +; WAVE32-NEXT: .cfi_undefined 1760 +; WAVE32-NEXT: .cfi_undefined 1761 +; WAVE32-NEXT: .cfi_undefined 1762 +; WAVE32-NEXT: .cfi_undefined 1763 +; WAVE32-NEXT: .cfi_undefined 1764 +; WAVE32-NEXT: .cfi_undefined 1765 +; WAVE32-NEXT: .cfi_undefined 1766 +; WAVE32-NEXT: .cfi_undefined 1767 +; WAVE32-NEXT: .cfi_undefined 1776 +; WAVE32-NEXT: .cfi_undefined 1777 +; WAVE32-NEXT: .cfi_undefined 1778 +; WAVE32-NEXT: .cfi_undefined 1779 +; WAVE32-NEXT: .cfi_undefined 1780 +; WAVE32-NEXT: .cfi_undefined 1781 +; WAVE32-NEXT: .cfi_undefined 1782 +; WAVE32-NEXT: .cfi_undefined 1783 +; WAVE32-NEXT: .cfi_undefined 48 +; WAVE32-NEXT: .cfi_undefined 49 +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: s_mov_b32 s18, s33 +; WAVE32-NEXT: .cfi_register 65, 50 +; WAVE32-NEXT: v_mov_b32_e32 v0, exec_lo +; WAVE32-NEXT: s_mov_b32 s33, 
s32 +; WAVE32-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_offset 1, 14592 +; WAVE32-NEXT: .cfi_def_cfa_register 65 +; WAVE32-NEXT: s_addk_i32 s32, 0x3a00 +; WAVE32-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1576, 32, 1, 32, 14208 +; WAVE32-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1577, 32, 1, 32, 14080 +; WAVE32-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1578, 32, 1, 32, 13952 +; WAVE32-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1579, 32, 1, 32, 13824 +; WAVE32-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1580, 32, 1, 32, 13696 +; WAVE32-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1581, 32, 1, 32, 13568 +; WAVE32-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1582, 32, 1, 32, 13440 +; WAVE32-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1583, 32, 1, 32, 13312 +; WAVE32-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1592, 32, 1, 32, 13184 +; WAVE32-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1593, 32, 1, 32, 13056 +; WAVE32-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1594, 32, 1, 32, 12928 +; WAVE32-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Spill +; WAVE32-NEXT: 
.cfi_llvm_vector_offset 1595, 32, 1, 32, 12800 +; WAVE32-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1596, 32, 1, 32, 12672 +; WAVE32-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1597, 32, 1, 32, 12544 +; WAVE32-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1598, 32, 1, 32, 12416 +; WAVE32-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1599, 32, 1, 32, 12288 +; WAVE32-NEXT: buffer_store_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1608, 32, 1, 32, 12160 +; WAVE32-NEXT: buffer_store_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1609, 32, 1, 32, 12032 +; WAVE32-NEXT: buffer_store_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1610, 32, 1, 32, 11904 +; WAVE32-NEXT: buffer_store_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1611, 32, 1, 32, 11776 +; WAVE32-NEXT: buffer_store_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1612, 32, 1, 32, 11648 +; WAVE32-NEXT: buffer_store_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1613, 32, 1, 32, 11520 +; WAVE32-NEXT: buffer_store_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1614, 32, 1, 32, 11392 +; WAVE32-NEXT: buffer_store_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1615, 32, 1, 32, 11264 +; WAVE32-NEXT: buffer_store_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Spill +; WAVE32-NEXT: 
.cfi_llvm_vector_offset 1624, 32, 1, 32, 11136 +; WAVE32-NEXT: buffer_store_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1625, 32, 1, 32, 11008 +; WAVE32-NEXT: buffer_store_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1626, 32, 1, 32, 10880 +; WAVE32-NEXT: buffer_store_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1627, 32, 1, 32, 10752 +; WAVE32-NEXT: buffer_store_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1628, 32, 1, 32, 10624 +; WAVE32-NEXT: buffer_store_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1629, 32, 1, 32, 10496 +; WAVE32-NEXT: buffer_store_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1630, 32, 1, 32, 10368 +; WAVE32-NEXT: buffer_store_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1631, 32, 1, 32, 10240 +; WAVE32-NEXT: buffer_store_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1640, 32, 1, 32, 10112 +; WAVE32-NEXT: buffer_store_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1641, 32, 1, 32, 9984 +; WAVE32-NEXT: buffer_store_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1642, 32, 1, 32, 9856 +; WAVE32-NEXT: buffer_store_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1643, 32, 1, 32, 9728 +; WAVE32-NEXT: buffer_store_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1644, 32, 1, 32, 9600 +; WAVE32-NEXT: buffer_store_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Spill +; WAVE32-NEXT: 
.cfi_llvm_vector_offset 1645, 32, 1, 32, 9472 +; WAVE32-NEXT: buffer_store_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1646, 32, 1, 32, 9344 +; WAVE32-NEXT: buffer_store_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1647, 32, 1, 32, 9216 +; WAVE32-NEXT: buffer_store_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1656, 32, 1, 32, 9088 +; WAVE32-NEXT: buffer_store_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1657, 32, 1, 32, 8960 +; WAVE32-NEXT: buffer_store_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1658, 32, 1, 32, 8832 +; WAVE32-NEXT: buffer_store_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1659, 32, 1, 32, 8704 +; WAVE32-NEXT: buffer_store_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1660, 32, 1, 32, 8576 +; WAVE32-NEXT: buffer_store_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1661, 32, 1, 32, 8448 +; WAVE32-NEXT: buffer_store_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1662, 32, 1, 32, 8320 +; WAVE32-NEXT: buffer_store_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1663, 32, 1, 32, 8192 +; WAVE32-NEXT: buffer_store_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1672, 32, 1, 32, 8064 +; WAVE32-NEXT: buffer_store_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1673, 32, 1, 32, 7936 +; WAVE32-NEXT: buffer_store_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Spill +; WAVE32-NEXT: 
.cfi_llvm_vector_offset 1674, 32, 1, 32, 7808 +; WAVE32-NEXT: buffer_store_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1675, 32, 1, 32, 7680 +; WAVE32-NEXT: buffer_store_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1676, 32, 1, 32, 7552 +; WAVE32-NEXT: buffer_store_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1677, 32, 1, 32, 7424 +; WAVE32-NEXT: buffer_store_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1678, 32, 1, 32, 7296 +; WAVE32-NEXT: buffer_store_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1679, 32, 1, 32, 7168 +; WAVE32-NEXT: buffer_store_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1688, 32, 1, 32, 7040 +; WAVE32-NEXT: buffer_store_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1689, 32, 1, 32, 6912 +; WAVE32-NEXT: buffer_store_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1690, 32, 1, 32, 6784 +; WAVE32-NEXT: buffer_store_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1691, 32, 1, 32, 6656 +; WAVE32-NEXT: buffer_store_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1692, 32, 1, 32, 6528 +; WAVE32-NEXT: buffer_store_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1693, 32, 1, 32, 6400 +; WAVE32-NEXT: buffer_store_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1694, 32, 1, 32, 6272 +; WAVE32-NEXT: buffer_store_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill +; WAVE32-NEXT: 
.cfi_llvm_vector_offset 1695, 32, 1, 32, 6144 +; WAVE32-NEXT: buffer_store_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1704, 32, 1, 32, 6016 +; WAVE32-NEXT: buffer_store_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1705, 32, 1, 32, 5888 +; WAVE32-NEXT: buffer_store_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1706, 32, 1, 32, 5760 +; WAVE32-NEXT: buffer_store_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1707, 32, 1, 32, 5632 +; WAVE32-NEXT: buffer_store_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1708, 32, 1, 32, 5504 +; WAVE32-NEXT: buffer_store_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1709, 32, 1, 32, 5376 +; WAVE32-NEXT: buffer_store_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1710, 32, 1, 32, 5248 +; WAVE32-NEXT: buffer_store_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1711, 32, 1, 32, 5120 +; WAVE32-NEXT: buffer_store_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1720, 32, 1, 32, 4992 +; WAVE32-NEXT: buffer_store_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1721, 32, 1, 32, 4864 +; WAVE32-NEXT: buffer_store_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1722, 32, 1, 32, 4736 +; WAVE32-NEXT: buffer_store_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1723, 32, 1, 32, 4608 +; WAVE32-NEXT: buffer_store_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; WAVE32-NEXT: 
.cfi_llvm_vector_offset 1724, 32, 1, 32, 4480 +; WAVE32-NEXT: buffer_store_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1725, 32, 1, 32, 4352 +; WAVE32-NEXT: buffer_store_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1726, 32, 1, 32, 4224 +; WAVE32-NEXT: buffer_store_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1727, 32, 1, 32, 4096 +; WAVE32-NEXT: buffer_store_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1736, 32, 1, 32, 3968 +; WAVE32-NEXT: buffer_store_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1737, 32, 1, 32, 3840 +; WAVE32-NEXT: buffer_store_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1738, 32, 1, 32, 3712 +; WAVE32-NEXT: buffer_store_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1739, 32, 1, 32, 3584 +; WAVE32-NEXT: buffer_store_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1740, 32, 1, 32, 3456 +; WAVE32-NEXT: buffer_store_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1741, 32, 1, 32, 3328 +; WAVE32-NEXT: buffer_store_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1742, 32, 1, 32, 3200 +; WAVE32-NEXT: buffer_store_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1743, 32, 1, 32, 3072 +; WAVE32-NEXT: buffer_store_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1752, 32, 1, 32, 2944 +; WAVE32-NEXT: buffer_store_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; WAVE32-NEXT: 
.cfi_llvm_vector_offset 1753, 32, 1, 32, 2816 +; WAVE32-NEXT: buffer_store_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1754, 32, 1, 32, 2688 +; WAVE32-NEXT: buffer_store_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1755, 32, 1, 32, 2560 +; WAVE32-NEXT: buffer_store_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1756, 32, 1, 32, 2432 +; WAVE32-NEXT: buffer_store_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1757, 32, 1, 32, 2304 +; WAVE32-NEXT: buffer_store_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1758, 32, 1, 32, 2176 +; WAVE32-NEXT: buffer_store_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1759, 32, 1, 32, 2048 +; WAVE32-NEXT: buffer_store_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1768, 32, 1, 32, 1920 +; WAVE32-NEXT: buffer_store_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1769, 32, 1, 32, 1792 +; WAVE32-NEXT: buffer_store_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1770, 32, 1, 32, 1664 +; WAVE32-NEXT: buffer_store_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1771, 32, 1, 32, 1536 +; WAVE32-NEXT: buffer_store_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1772, 32, 1, 32, 1408 +; WAVE32-NEXT: buffer_store_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1773, 32, 1, 32, 1280 +; WAVE32-NEXT: buffer_store_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 
1774, 32, 1, 32, 1152 +; WAVE32-NEXT: buffer_store_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1775, 32, 1, 32, 1024 +; WAVE32-NEXT: buffer_store_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1784, 32, 1, 32, 896 +; WAVE32-NEXT: buffer_store_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1785, 32, 1, 32, 768 +; WAVE32-NEXT: buffer_store_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1786, 32, 1, 32, 640 +; WAVE32-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1787, 32, 1, 32, 512 +; WAVE32-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1788, 32, 1, 32, 384 +; WAVE32-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1789, 32, 1, 32, 256 +; WAVE32-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1790, 32, 1, 32, 128 +; WAVE32-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1791, 32, 1, 32, 0 +; WAVE32-NEXT: s_mov_b32 s16, exec_lo +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, 3 +; WAVE32-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:460 +; WAVE32-NEXT: v_writelane_b32 v0, s30, 0 +; WAVE32-NEXT: v_writelane_b32 v0, s31, 1 +; WAVE32-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_offset 16, 14336 +; WAVE32-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:460 +; WAVE32-NEXT: s_waitcnt vmcnt(0) +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s16 +; WAVE32-NEXT: s_getpc_b64 s[16:17] +; WAVE32-NEXT: 
s_add_u32 s16, s16, caller_needs_to_spill_pc_to_memory@rel32@lo+4 +; WAVE32-NEXT: s_addc_u32 s17, s17, caller_needs_to_spill_pc_to_memory@rel32@hi+12 +; WAVE32-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE32-NEXT: s_mov_b32 s4, exec_lo +; WAVE32-NEXT: s_mov_b32 exec_lo, 3 +; WAVE32-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:460 +; WAVE32-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload +; WAVE32-NEXT: s_waitcnt vmcnt(0) +; WAVE32-NEXT: v_readlane_b32 s30, v0, 0 +; WAVE32-NEXT: v_readlane_b32 s31, v0, 1 +; WAVE32-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:460 +; WAVE32-NEXT: s_waitcnt vmcnt(0) +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s4 +; WAVE32-NEXT: s_clause 0x3e +; WAVE32-NEXT: buffer_load_dword v255, off, s[0:3], s33 +; WAVE32-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:4 +; WAVE32-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:8 +; WAVE32-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:12 +; WAVE32-NEXT: buffer_load_dword v251, off, s[0:3], s33 offset:16 +; WAVE32-NEXT: buffer_load_dword v250, off, s[0:3], s33 offset:20 +; WAVE32-NEXT: buffer_load_dword v249, off, s[0:3], s33 offset:24 +; WAVE32-NEXT: buffer_load_dword v248, off, s[0:3], s33 offset:28 +; WAVE32-NEXT: buffer_load_dword v239, off, s[0:3], s33 offset:32 +; WAVE32-NEXT: buffer_load_dword v238, off, s[0:3], s33 offset:36 +; WAVE32-NEXT: buffer_load_dword v237, off, s[0:3], s33 offset:40 +; WAVE32-NEXT: buffer_load_dword v236, off, s[0:3], s33 offset:44 +; WAVE32-NEXT: buffer_load_dword v235, off, s[0:3], s33 offset:48 +; WAVE32-NEXT: buffer_load_dword v234, off, s[0:3], s33 offset:52 +; WAVE32-NEXT: buffer_load_dword v233, off, s[0:3], s33 offset:56 +; WAVE32-NEXT: buffer_load_dword v232, off, s[0:3], s33 offset:60 +; WAVE32-NEXT: buffer_load_dword v223, off, s[0:3], s33 offset:64 +; WAVE32-NEXT: buffer_load_dword v222, off, s[0:3], s33 offset:68 +; WAVE32-NEXT: buffer_load_dword v221, off, 
s[0:3], s33 offset:72 +; WAVE32-NEXT: buffer_load_dword v220, off, s[0:3], s33 offset:76 +; WAVE32-NEXT: buffer_load_dword v219, off, s[0:3], s33 offset:80 +; WAVE32-NEXT: buffer_load_dword v218, off, s[0:3], s33 offset:84 +; WAVE32-NEXT: buffer_load_dword v217, off, s[0:3], s33 offset:88 +; WAVE32-NEXT: buffer_load_dword v216, off, s[0:3], s33 offset:92 +; WAVE32-NEXT: buffer_load_dword v207, off, s[0:3], s33 offset:96 +; WAVE32-NEXT: buffer_load_dword v206, off, s[0:3], s33 offset:100 +; WAVE32-NEXT: buffer_load_dword v205, off, s[0:3], s33 offset:104 +; WAVE32-NEXT: buffer_load_dword v204, off, s[0:3], s33 offset:108 +; WAVE32-NEXT: buffer_load_dword v203, off, s[0:3], s33 offset:112 +; WAVE32-NEXT: buffer_load_dword v202, off, s[0:3], s33 offset:116 +; WAVE32-NEXT: buffer_load_dword v201, off, s[0:3], s33 offset:120 +; WAVE32-NEXT: buffer_load_dword v200, off, s[0:3], s33 offset:124 +; WAVE32-NEXT: buffer_load_dword v191, off, s[0:3], s33 offset:128 +; WAVE32-NEXT: buffer_load_dword v190, off, s[0:3], s33 offset:132 +; WAVE32-NEXT: buffer_load_dword v189, off, s[0:3], s33 offset:136 +; WAVE32-NEXT: buffer_load_dword v188, off, s[0:3], s33 offset:140 +; WAVE32-NEXT: buffer_load_dword v187, off, s[0:3], s33 offset:144 +; WAVE32-NEXT: buffer_load_dword v186, off, s[0:3], s33 offset:148 +; WAVE32-NEXT: buffer_load_dword v185, off, s[0:3], s33 offset:152 +; WAVE32-NEXT: buffer_load_dword v184, off, s[0:3], s33 offset:156 +; WAVE32-NEXT: buffer_load_dword v175, off, s[0:3], s33 offset:160 +; WAVE32-NEXT: buffer_load_dword v174, off, s[0:3], s33 offset:164 +; WAVE32-NEXT: buffer_load_dword v173, off, s[0:3], s33 offset:168 +; WAVE32-NEXT: buffer_load_dword v172, off, s[0:3], s33 offset:172 +; WAVE32-NEXT: buffer_load_dword v171, off, s[0:3], s33 offset:176 +; WAVE32-NEXT: buffer_load_dword v170, off, s[0:3], s33 offset:180 +; WAVE32-NEXT: buffer_load_dword v169, off, s[0:3], s33 offset:184 +; WAVE32-NEXT: buffer_load_dword v168, off, s[0:3], s33 offset:188 +; 
WAVE32-NEXT: buffer_load_dword v159, off, s[0:3], s33 offset:192 +; WAVE32-NEXT: buffer_load_dword v158, off, s[0:3], s33 offset:196 +; WAVE32-NEXT: buffer_load_dword v157, off, s[0:3], s33 offset:200 +; WAVE32-NEXT: buffer_load_dword v156, off, s[0:3], s33 offset:204 +; WAVE32-NEXT: buffer_load_dword v155, off, s[0:3], s33 offset:208 +; WAVE32-NEXT: buffer_load_dword v154, off, s[0:3], s33 offset:212 +; WAVE32-NEXT: buffer_load_dword v153, off, s[0:3], s33 offset:216 +; WAVE32-NEXT: buffer_load_dword v152, off, s[0:3], s33 offset:220 +; WAVE32-NEXT: buffer_load_dword v143, off, s[0:3], s33 offset:224 +; WAVE32-NEXT: buffer_load_dword v142, off, s[0:3], s33 offset:228 +; WAVE32-NEXT: buffer_load_dword v141, off, s[0:3], s33 offset:232 +; WAVE32-NEXT: buffer_load_dword v140, off, s[0:3], s33 offset:236 +; WAVE32-NEXT: buffer_load_dword v139, off, s[0:3], s33 offset:240 +; WAVE32-NEXT: buffer_load_dword v138, off, s[0:3], s33 offset:244 +; WAVE32-NEXT: buffer_load_dword v137, off, s[0:3], s33 offset:248 +; WAVE32-NEXT: s_clause 0x30 +; WAVE32-NEXT: buffer_load_dword v136, off, s[0:3], s33 offset:252 +; WAVE32-NEXT: buffer_load_dword v127, off, s[0:3], s33 offset:256 +; WAVE32-NEXT: buffer_load_dword v126, off, s[0:3], s33 offset:260 +; WAVE32-NEXT: buffer_load_dword v125, off, s[0:3], s33 offset:264 +; WAVE32-NEXT: buffer_load_dword v124, off, s[0:3], s33 offset:268 +; WAVE32-NEXT: buffer_load_dword v123, off, s[0:3], s33 offset:272 +; WAVE32-NEXT: buffer_load_dword v122, off, s[0:3], s33 offset:276 +; WAVE32-NEXT: buffer_load_dword v121, off, s[0:3], s33 offset:280 +; WAVE32-NEXT: buffer_load_dword v120, off, s[0:3], s33 offset:284 +; WAVE32-NEXT: buffer_load_dword v111, off, s[0:3], s33 offset:288 +; WAVE32-NEXT: buffer_load_dword v110, off, s[0:3], s33 offset:292 +; WAVE32-NEXT: buffer_load_dword v109, off, s[0:3], s33 offset:296 +; WAVE32-NEXT: buffer_load_dword v108, off, s[0:3], s33 offset:300 +; WAVE32-NEXT: buffer_load_dword v107, off, s[0:3], s33 offset:304 
+; WAVE32-NEXT: buffer_load_dword v106, off, s[0:3], s33 offset:308 +; WAVE32-NEXT: buffer_load_dword v105, off, s[0:3], s33 offset:312 +; WAVE32-NEXT: buffer_load_dword v104, off, s[0:3], s33 offset:316 +; WAVE32-NEXT: buffer_load_dword v95, off, s[0:3], s33 offset:320 +; WAVE32-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:324 +; WAVE32-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:328 +; WAVE32-NEXT: buffer_load_dword v92, off, s[0:3], s33 offset:332 +; WAVE32-NEXT: buffer_load_dword v91, off, s[0:3], s33 offset:336 +; WAVE32-NEXT: buffer_load_dword v90, off, s[0:3], s33 offset:340 +; WAVE32-NEXT: buffer_load_dword v89, off, s[0:3], s33 offset:344 +; WAVE32-NEXT: buffer_load_dword v88, off, s[0:3], s33 offset:348 +; WAVE32-NEXT: buffer_load_dword v79, off, s[0:3], s33 offset:352 +; WAVE32-NEXT: buffer_load_dword v78, off, s[0:3], s33 offset:356 +; WAVE32-NEXT: buffer_load_dword v77, off, s[0:3], s33 offset:360 +; WAVE32-NEXT: buffer_load_dword v76, off, s[0:3], s33 offset:364 +; WAVE32-NEXT: buffer_load_dword v75, off, s[0:3], s33 offset:368 +; WAVE32-NEXT: buffer_load_dword v74, off, s[0:3], s33 offset:372 +; WAVE32-NEXT: buffer_load_dword v73, off, s[0:3], s33 offset:376 +; WAVE32-NEXT: buffer_load_dword v72, off, s[0:3], s33 offset:380 +; WAVE32-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:384 +; WAVE32-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:388 +; WAVE32-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:392 +; WAVE32-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:396 +; WAVE32-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:400 +; WAVE32-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:404 +; WAVE32-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:408 +; WAVE32-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:412 +; WAVE32-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:416 +; WAVE32-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:420 +; WAVE32-NEXT: buffer_load_dword v45, off, s[0:3], 
s33 offset:424 +; WAVE32-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:428 +; WAVE32-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:432 +; WAVE32-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 +; WAVE32-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 +; WAVE32-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 +; WAVE32-NEXT: s_mov_b32 s32, s33 +; WAVE32-NEXT: .cfi_def_cfa_register 64 +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 s33, s18 +; WAVE32-NEXT: s_waitcnt vmcnt(0) +; WAVE32-NEXT: s_setpc_b64 s[30:31] call void @caller_needs_to_spill_pc_to_memory() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-lower-all.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-lower-all.ll new file mode 100644 index 0000000000000..f30a382a62c6b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-lower-all.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +; Test to check if static LDS accesses in kernels without sanitize_address attribute are lowered if +; other kernels in module have sanitize_address attribute. 
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 4 +@lds_2 = internal addrspace(3) global [1 x i32] poison, align 8 + +define amdgpu_kernel void @k0() sanitize_address { +; CHECK-LABEL: define amdgpu_kernel void @k0( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB20:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 33 +; CHECK-NEXT: 
[[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 31) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 68 +; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(1) [[TMP18]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP19]], i64 28) +; CHECK-NEXT: br label %[[BB20]] +; CHECK: [[BB20]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(3) [[TMP23]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i32 [[TMP26]] +; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP27]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr addrspace(3) [[TMP25]] to i32 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i32 [[TMP28]] +; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP29]], align 2 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP30:%.*]] = call ptr 
@llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[TMP30]] to i64 +; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP32]], i64 [[TMP31]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; + store i8 7, ptr addrspace(3) @lds_1, align 4 + store i32 8, ptr addrspace(3) @lds_2, align 2 + ret void +} + +define amdgpu_kernel void @k1() { +; CHECK-LABEL: define amdgpu_kernel void @k1( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) 
[[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 36 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 28) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k1.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k1, i32 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(3) [[TMP21]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP22]] +; CHECK-NEXT: store i32 9, ptr addrspace(1) [[TMP23]], align 2 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP24:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr [[TMP24]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP26]], i64 [[TMP25]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; + store i32 9, ptr addrspace(3) @lds_2, align 2 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-lower-none.ll 
b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-lower-none.ll new file mode 100644 index 0000000000000..5ce12888babbc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-lower-none.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +; Test to check if LDS accesses in kernels without sanitize_address attribute are not lowered +; if all other kernels don't have sanitize_address attribute. +@lds_1 = internal addrspace(3) global [1 x i8] poison, align 4 +@lds_2 = internal addrspace(3) global [1 x i32] poison, align 8 + +;. +; CHECK: @lds_1 = internal addrspace(3) global [1 x i8] poison, align 4 +; CHECK: @lds_2 = internal addrspace(3) global [1 x i32] poison, align 8 +;. +define amdgpu_kernel void @k0() { +; CHECK-LABEL: define amdgpu_kernel void @k0() { +; CHECK-NEXT: store i8 7, ptr addrspace(3) @lds_1, align 4 +; CHECK-NEXT: store i32 8, ptr addrspace(3) @lds_2, align 2 +; CHECK-NEXT: ret void +; + store i8 7, ptr addrspace(3) @lds_1, align 4 + store i32 8, ptr addrspace(3) @lds_2, align 2 + ret void +} + +define amdgpu_kernel void @k1() { +; CHECK-LABEL: define amdgpu_kernel void @k1() { +; CHECK-NEXT: store i32 9, ptr addrspace(3) @lds_2, align 2 +; CHECK-NEXT: ret void +; + store i32 9, ptr addrspace(3) @lds_2, align 2 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: [[META0:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll index ae2bcbbb81b5f..a6e6b84bba304 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll @@ -20,8 +20,12 @@ define void @non_kernel_function() sanitize_address { ; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] -; CHECK-NEXT: [[Y:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr -; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr addrspace(1) [[TMP13]] to ptr ; CHECK-NEXT: store i8 5, ptr [[TMP9]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll index 3a05f93df35a3..b9b4c90daea87 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: 
--check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if static LDS is lowered correctly when a non-kernel with LDS accesses is called from kernel. @@ -28,8 +28,12 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]] -; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP34:%.*]] = addrspacecast ptr addrspace(1) [[TMP33]] to ptr +; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP35]] +; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(1) [[TMP36]] to ptr ; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP14]] @@ -45,16 +49,16 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP25:%.*]] = and i1 [[TMP21]], [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP25]]) ; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 0 -; CHECK-NEXT: br i1 [[TMP27]], label [[ASAN_REPORT:%.*]], label [[TMP30:%.*]], !prof [[PROF2:![0-9]+]] -; CHECK: asan.report: -; CHECK-NEXT: br i1 [[TMP25]], label [[TMP28:%.*]], label [[TMP29:%.*]] -; CHECK: 28: +; CHECK-NEXT: br i1 [[TMP27]], label %[[ASAN_REPORT:.*]], label %[[BB35:.*]], !prof 
[[PROF2:![0-9]+]] +; CHECK: [[ASAN_REPORT]]: +; CHECK-NEXT: br i1 [[TMP25]], label %[[BB33:.*]], label %[[BB34:.*]] +; CHECK: [[BB33]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7:[0-9]+]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[TMP29]] -; CHECK: 29: -; CHECK-NEXT: br label [[TMP30]] -; CHECK: 30: +; CHECK-NEXT: br label %[[BB34]] +; CHECK: [[BB34]]: +; CHECK-NEXT: br label %[[BB35]] +; CHECK: [[BB35]]: ; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8 ; CHECK-NEXT: ret void ; @@ -67,15 +71,15 @@ define void @use_variables() sanitize_address { define amdgpu_kernel void @k0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @k0( ; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB24:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4 ; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]] @@ -100,9 +104,9 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 
132 ; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr addrspace(1) [[TMP67]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP68]], i64 28) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 24: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[BB24]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 @@ -124,16 +128,16 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP41:%.*]] = and i1 [[TMP37]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP41]]) ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne i64 [[TMP42]], 0 -; CHECK-NEXT: br i1 [[TMP43]], label [[ASAN_REPORT:%.*]], label [[TMP46:%.*]], !prof [[PROF2]] -; CHECK: asan.report: -; CHECK-NEXT: br i1 [[TMP41]], label [[TMP44:%.*]], label [[CONDFREE:%.*]] -; CHECK: 44: +; CHECK-NEXT: br i1 [[TMP43]], label %[[ASAN_REPORT:.*]], label %[[BB46:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT]]: +; CHECK-NEXT: br i1 [[TMP41]], label %[[BB44:.*]], label %[[BB45:.*]] +; CHECK: [[BB44]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[CONDFREE]] -; CHECK: 45: -; CHECK-NEXT: br label [[TMP46]] -; CHECK: 46: +; CHECK-NEXT: br label %[[BB45]] +; CHECK: [[BB45]]: +; CHECK-NEXT: br label %[[BB46]] +; CHECK: [[BB46]]: ; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP31]], align 1 ; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 
[[TMP47]] @@ -152,16 +156,16 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP59:%.*]] = and i1 [[TMP54]], [[TMP58]] ; CHECK-NEXT: [[TMP60:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP59]]) ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne i64 [[TMP60]], 0 -; CHECK-NEXT: br i1 [[TMP61]], label [[ASAN_REPORT1:%.*]], label [[TMP64:%.*]], !prof [[PROF2]] -; CHECK: asan.report1: -; CHECK-NEXT: br i1 [[TMP59]], label [[TMP62:%.*]], label [[TMP63:%.*]] -; CHECK: 64: +; CHECK-NEXT: br i1 [[TMP61]], label %[[ASAN_REPORT1:.*]], label %[[BB66:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT1]]: +; CHECK-NEXT: br i1 [[TMP59]], label %[[BB64:.*]], label %[[BB65:.*]] +; CHECK: [[BB64]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP83]]) #[[ATTR7]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[TMP63]] -; CHECK: 65: -; CHECK-NEXT: br label [[TMP64]] -; CHECK: 66: +; CHECK-NEXT: br label %[[BB65]] +; CHECK: [[BB65]]: +; CHECK-NEXT: br label %[[BB66]] +; CHECK: [[BB66]]: ; CHECK-NEXT: [[TMP84:%.*]] = ptrtoint ptr addrspace(1) [[TMP82]] to i64 ; CHECK-NEXT: [[TMP85:%.*]] = lshr i64 [[TMP84]], 3 ; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[TMP85]], 2147450880 @@ -174,28 +178,28 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP76:%.*]] = and i1 [[TMP72]], [[TMP75]] ; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP76]]) ; CHECK-NEXT: [[TMP78:%.*]] = icmp ne i64 [[TMP77]], 0 -; CHECK-NEXT: br i1 [[TMP78]], label [[ASAN_REPORT2:%.*]], label [[TMP81:%.*]], !prof [[PROF2]] -; CHECK: asan.report2: -; CHECK-NEXT: br i1 [[TMP76]], label [[TMP79:%.*]], label [[TMP80:%.*]] -; CHECK: 79: +; CHECK-NEXT: br i1 [[TMP78]], label %[[ASAN_REPORT2:.*]], label %[[BB81:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT2]]: +; CHECK-NEXT: br i1 [[TMP76]], label %[[BB79:.*]], label %[[BB80:.*]] +; CHECK: [[BB79]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP84]]) #[[ATTR7]] ; CHECK-NEXT: call void 
@llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[TMP80]] -; CHECK: 80: -; CHECK-NEXT: br label [[TMP81]] -; CHECK: 81: +; CHECK-NEXT: br label %[[BB80]] +; CHECK: [[BB80]]: +; CHECK-NEXT: br label %[[BB81]] +; CHECK: [[BB81]]: ; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP48]], align 2 -; CHECK-NEXT: br label [[CONDFREE1:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 ; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @use_variables() diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-all.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-all.ll new file mode 100644 index 0000000000000..4625a7f626f9b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-all.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +; Test to check if static LDS is lowered correctly when a non-kernel without sanitize_address attr with LDS accesses is called from +; kernel which has sanitize_address attr. 
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 1 +@lds_2 = internal addrspace(3) global [1 x i32] poison, align 2 +@lds_3 = external addrspace(3) global [3 x i8], align 4 +@lds_4 = external addrspace(3) global [4 x i8], align 8 + +;. +; @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]] +; @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 3, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 4, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address +; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address +;. 
+define void @use_variables() { +; CHECK-LABEL: define void @use_variables() { +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP18]] +; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP18]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(1) [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = addrspacecast ptr addrspace(1) [[TMP14]] to ptr +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP17]] to ptr +; CHECK-NEXT: store i8 3, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(3) [[TMP12]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds 
i8, ptr addrspace(1) [[TMP4]], i32 [[TMP19]] +; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP20]], align 8 +; CHECK-NEXT: ret void +; + %X = addrspacecast ptr addrspace(3) @lds_3 to ptr + store i8 3, ptr addrspacecast( ptr addrspace(3) @lds_3 to ptr), align 4 + store i8 3, ptr addrspace(3) @lds_4, align 8 + ret void +} + +define amdgpu_kernel void @k0() sanitize_address { +; CHECK-LABEL: define amdgpu_kernel void @k0( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB24:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: 
[[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 33 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 31) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 68 +; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(1) [[TMP18]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP19]], i64 28) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 99 +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 29) +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 132 +; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 28) +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[BB24]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP28]] +; CHECK-NEXT: call 
void @use_variables() +; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(3) [[TMP27]] to i32 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP25]], i32 [[TMP30]] +; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP31]], align 1 +; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(3) [[TMP29]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP25]], i32 [[TMP32]] +; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP33]], align 2 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP34:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr [[TMP34]] to i64 +; CHECK-NEXT: [[TMP36:%.*]] = ptrtoint ptr addrspace(1) [[TMP25]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP36]], i64 [[TMP35]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; + call void @use_variables() + store i8 7, ptr addrspace(3) @lds_1, align 1 + store i32 8, ptr addrspace(3) @lds_2, align 2 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-none.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-none.ll new file mode 100644 index 0000000000000..5dbab5643b929 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-lower-none.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +; Test to check if LDS is not lowered when a non-kernel with sanitize_address attr and with LDS accesses is called from +; kernel which doesn't have sanitize_address attr. +@lds_1 = internal addrspace(3) global [1 x i8] poison, align 1 +@lds_2 = internal addrspace(3) global [1 x i32] poison, align 2 +@lds_3 = external addrspace(3) global [3 x i8], align 4 +@lds_4 = external addrspace(3) global [4 x i8], align 8 + +;. +; CHECK: @lds_1 = internal addrspace(3) global [1 x i8] poison, align 1 +; CHECK: @lds_2 = internal addrspace(3) global [1 x i32] poison, align 2 +; CHECK: @lds_3 = external addrspace(3) global [3 x i8], align 4 +; CHECK: @lds_4 = external addrspace(3) global [4 x i8], align 8 +;. 
+define void @use_variables() sanitize_address { +; CHECK-LABEL: define void @use_variables( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) @lds_3 to ptr +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(3) @lds_3 to ptr +; CHECK-NEXT: store i8 3, ptr [[TMP1]], align 4 +; CHECK-NEXT: store i8 3, ptr addrspace(3) @lds_4, align 8 +; CHECK-NEXT: ret void +; + %X = addrspacecast ptr addrspace(3) @lds_3 to ptr + store i8 3, ptr addrspacecast( ptr addrspace(3) @lds_3 to ptr), align 4 + store i8 3, ptr addrspace(3) @lds_4, align 8 + ret void +} + +define amdgpu_kernel void @k0() { +; CHECK-LABEL: define amdgpu_kernel void @k0() { +; CHECK-NEXT: call void @use_variables() +; CHECK-NEXT: store i8 7, ptr addrspace(3) @lds_1, align 1 +; CHECK-NEXT: store i32 8, ptr addrspace(3) @lds_2, align 2 +; CHECK-NEXT: ret void +; + call void @use_variables() + store i8 7, ptr addrspace(3) @lds_1, align 1 + store i32 8, ptr addrspace(3) @lds_2, align 2 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll index 1dd391ec6321a..255dda562c1ea 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if LDS accesses are lowered correctly when a call is made to nested non-kernel. @@ -6,50 +6,64 @@ @A = external addrspace(3) global [8 x ptr] @B = external addrspace(3) global [0 x i32] +;. +; @llvm.amdgcn.sw.lds.kernel_0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]] +; @llvm.amdgcn.sw.lds.kernel_0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_0.md.type { %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]] +; @llvm.amdgcn.sw.lds.kernel_2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_2.md.type { %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]] +; @llvm.amdgcn.kernel_1.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1:![0-9]+]] +; 
@llvm.amdgcn.sw.lds.kernel_1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_1.md.type { %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_3 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]] +; @llvm.amdgcn.kernel_3.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1]] +; @llvm.amdgcn.sw.lds.kernel_3.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_3.md.type { %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address +; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address +;. 
define amdgpu_kernel void @kernel_0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_0( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]]) -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8 -; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24) -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr 
addrspace(1) [[TMP6]], i64 96 -; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 96 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 32) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: 
[[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 ; CHECK-NEXT: call void @call_store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @call_store_A() @@ -58,56 +72,56 @@ define amdgpu_kernel void @kernel_0() sanitize_address { define amdgpu_kernel void @kernel_1() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_1( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], 
[[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] -; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15 -; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4 -; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 1), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 3 -; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4 -; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP21]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4 +; 
CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP9]], i64 15 +; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 1), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP17]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP16]], i64 [[TMP18]]) -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 -; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 
[[TMP19]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP22]], i64 24) +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_1.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 -; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP24]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -116,48 
+130,48 @@ define amdgpu_kernel void @kernel_1() sanitize_address { define amdgpu_kernel void @kernel_2() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_2( -; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]]) -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8 -; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24) -; CHECK-NEXT: 
[[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 -; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 96 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 32) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void 
@llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 ; CHECK-NEXT: call void @store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @store_A() @@ -166,56 +180,56 @@ define amdgpu_kernel void @kernel_2() sanitize_address { define amdgpu_kernel void @kernel_3() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_3( -; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META6:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = 
or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] -; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15 -; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4 -; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 1), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 3 -; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4 -; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP21]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), 
align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP9]], i64 15 +; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 1), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP17]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP16]], i64 [[TMP18]]) -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 -; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: [[TMP20:%.*]] = inttoptr 
i64 [[TMP19]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP22]], i64 24) +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_3.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 -; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP24]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ 
-237,14 +251,16 @@ define private void @store_A() sanitize_address { ; CHECK-SAME: ) #[[ATTR2]] { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: store ptr [[TMP10]], ptr null, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr +; CHECK-NEXT: store ptr [[TMP11]], ptr null, align 8 ; CHECK-NEXT: ret void ; store ptr addrspacecast (ptr 
addrspace(3) @A to ptr), ptr null @@ -256,14 +272,16 @@ define private ptr @get_B_ptr() sanitize_address { ; CHECK-SAME: ) #[[ATTR2]] { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: ret ptr [[TMP10]] +; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr +; CHECK-NEXT: ret ptr [[TMP11]] ; ret ptr addrspacecast (ptr addrspace(3) @B to ptr) } @@ -272,8 
+290,6 @@ define private ptr @get_B_ptr() sanitize_address { !0 = !{i32 4, !"nosanitize_address", i32 1} ;. -; CHECK: [[META2]] = !{i32 0} -; CHECK: [[META3]] = !{i32 1} -; CHECK: [[META4]] = !{i32 2} -; CHECK: [[META5]] = !{i32 3} +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll index ed9107764eb91..7184ebbb8faa3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if LDS accesses are lowered correctly when a call is made to nested non-kernel. @@ -6,18 +6,32 @@ @A = external addrspace(3) global [8 x ptr] @B = external addrspace(3) global [0 x i32] +;. 
+; @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]] +; @llvm.amdgcn.sw.lds.kernel_2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_2.md.type { %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]] +; @llvm.amdgcn.kernel_1.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1:![0-9]+]] +; @llvm.amdgcn.sw.lds.kernel_1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_1.md.type { %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_3 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]] +; @llvm.amdgcn.kernel_3.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1]] +; @llvm.amdgcn.sw.lds.kernel_3.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_3.md.type { %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]] +; @llvm.amdgcn.sw.lds.kernel_0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_0.md.type { %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) 
@llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address +; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address +;. 
define amdgpu_kernel void @kernel_0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_0( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4 ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] @@ -33,23 +47,23 @@ define amdgpu_kernel void @kernel_0() sanitize_address { ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 ; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP15:%.*]] = load ptr 
addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 ; CHECK-NEXT: call void @call_store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @call_store_A() @@ -58,16 +72,16 @@ define amdgpu_kernel void @kernel_0() sanitize_address { define amdgpu_kernel void @kernel_1() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_1( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4 ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr 
inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4 ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] @@ -90,24 +104,24 @@ define amdgpu_kernel void @kernel_1() sanitize_address { ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 ; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_1.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 ; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -116,16 +130,16 @@ define amdgpu_kernel void @kernel_1() sanitize_address { define amdgpu_kernel void @kernel_2() 
sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_2( -; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4 ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] @@ -141,23 +155,23 @@ define amdgpu_kernel void @kernel_2() sanitize_address { ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 ; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) 
@llvm.amdgcn.sw.lds.kernel_2, align 8 ; CHECK-NEXT: call void @store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @store_A() @@ -166,16 +180,16 @@ define amdgpu_kernel void @kernel_2() sanitize_address { define amdgpu_kernel void @kernel_3() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_3( -; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META6:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4 ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds 
([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4 ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] @@ -198,24 +212,24 @@ define amdgpu_kernel void @kernel_3() sanitize_address { ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 ; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_3.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 ; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -243,7 +257,9 @@ define private void @store_A() sanitize_address { ; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) 
[[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP12]] to ptr ; CHECK-NEXT: store ptr [[TMP10]], ptr null, align 8 ; CHECK-NEXT: ret void ; @@ -262,7 +278,9 @@ define private ptr @get_B_ptr() sanitize_address { ; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP12]] to ptr ; CHECK-NEXT: ret ptr [[TMP10]] ; ret ptr addrspacecast (ptr addrspace(3) @B to ptr) @@ -272,8 +290,6 @@ define private ptr @get_B_ptr() sanitize_address { !0 = !{i32 4, !"nosanitize_address", i32 1} ;. -; CHECK: [[META2]] = !{i32 0} -; CHECK: [[META3]] = !{i32 1} -; CHECK: [[META4]] = !{i32 2} -; CHECK: [[META5]] = !{i32 3} +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll index b9fa89dd6f0a6..704bc9e635294 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll @@ -29,8 +29,12 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP9]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(1) [[TMP10]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP11]] -; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr -; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr +; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = addrspacecast ptr addrspace(1) [[TMP19]] to ptr +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(1) [[TMP17]] to ptr ; CHECK-NEXT: store i8 3, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP12]] to i32 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP14]] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll index 11e912287c7f7..8f5abe962f8eb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll @@ -1,4 +1,4 @@ -; 
NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if static LDS is lowered correctly when a non-kernel with LDS accesses is called from kernel. @@ -28,8 +28,12 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]] -; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = addrspacecast ptr addrspace(1) [[TMP18]] to ptr +; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(1) [[TMP17]] to ptr ; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP14]] @@ -44,16 +48,16 @@ define void @use_variables() sanitize_address { define amdgpu_kernel void @k0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @k0( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id 
[[META2:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB24:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4 ; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]] @@ -78,9 +82,9 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 132 ; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr addrspace(1) [[TMP67]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP68]], i64 28) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 24: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[BB24]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 @@ -94,17 +98,17 @@ define amdgpu_kernel void 
@k0() sanitize_address { ; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP47]] ; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP48]], align 2 -; CHECK-NEXT: br label [[CONDFREE1:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 ; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @use_variables() @@ -124,5 +128,6 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } ;. ; CHECK: [[META0]] = !{i32 0, i32 1} -; CHECK: [[META1]] = !{i32 0} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +; CHECK: [[META2]] = !{i32 0} ;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll new file mode 100644 index 0000000000000..1973a0acf4659 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s +@lds = internal addrspace(3) global [5 x i32] poison, align 16 + +;. 
+; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 16, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 20, i32 64 } }, no_sanitize_address +;. +define amdgpu_kernel void @k0() sanitize_address { +; CHECK-LABEL: define amdgpu_kernel void @k0( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; 
CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 52 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 44) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(3) [[TMP21]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = addrspacecast ptr addrspace(1) [[TMP23]] to ptr +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [5 x i32], ptr [[TMP24]], i64 0, i64 0 +; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; + %gep = getelementptr inbounds [5 x i32], ptr addrspacecast (ptr 
addrspace(3) @lds to ptr), i64 0, i64 0 + store i32 1, ptr %gep, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="16" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +;. +; CHECK: [[META0]] = !{i32 0, i32 1} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll new file mode 100644 index 0000000000000..34caf91def933 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +; Test to check if vector of static LDS ptrs accesses in kernel are lowered correctly. +@lds_var1 = internal addrspace(3) global i32 poison +@lds_var2 = internal addrspace(3) global i32 poison + +;. +; CHECK: @llvm.amdgcn.sw.lds.example = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.amdgcn.sw.lds.example.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.example.md.type { %llvm.amdgcn.sw.lds.example.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.example.md.item { i32 32, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.example.md.item { i32 64, i32 4, i32 32 } }, no_sanitize_address +;. 
+define amdgpu_kernel void @example() sanitize_address { +; CHECK-LABEL: define amdgpu_kernel void @example( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[ENTRY:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.example, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 36 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void 
@__asan_poison_region(i64 [[TMP17]], i64 28) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 68 +; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(1) [[TMP18]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP19]], i64 28) +; CHECK-NEXT: br label %[[ENTRY]] +; CHECK: [[ENTRY]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.example, align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.example, i32 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.example, i32 [[TMP23]] +; CHECK-NEXT: [[VEC_LDS_PTRS:%.*]] = insertelement <2 x ptr addrspace(3)> poison, ptr addrspace(3) [[TMP22]], i32 0 +; CHECK-NEXT: [[VEC_LDS_PTRS1:%.*]] = insertelement <2 x ptr addrspace(3)> [[VEC_LDS_PTRS]], ptr addrspace(3) [[TMP24]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[VEC_LDS_PTRS1]] to <2 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], <2 x i32> [[TMP25]] +; CHECK-NEXT: [[TMP32:%.*]] = addrspacecast <2 x ptr addrspace(1)> [[TMP31]] to <2 x ptr> +; CHECK-NEXT: [[ELEM0:%.*]] = extractelement <2 x ptr> [[TMP32]], i32 0 +; CHECK-NEXT: store i32 42, ptr [[ELEM0]], align 4 +; CHECK-NEXT: [[ELEM1:%.*]] = extractelement <2 x ptr> [[TMP32]], i32 1 +; CHECK-NEXT: store i32 43, ptr [[ELEM1]], 
align 4 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP33:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP34:%.*]] = ptrtoint ptr [[TMP33]] to i64 +; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP35]], i64 [[TMP34]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; +entry: + ; Create a vector of flat pointers + %vec_lds_ptrs = insertelement <2 x ptr addrspace(3)> poison, ptr addrspace(3) @lds_var1, i32 0 + %vec_lds_ptrs1 = insertelement <2 x ptr addrspace(3)> %vec_lds_ptrs, ptr addrspace(3) @lds_var2, i32 1 + %vec_flat_ptrs = addrspacecast <2 x ptr addrspace(3)> %vec_lds_ptrs1 to <2 x ptr> + %elem0 = extractelement <2 x ptr> %vec_flat_ptrs, i32 0 + store i32 42, ptr %elem0, align 4 + %elem1 = extractelement <2 x ptr> %vec_flat_ptrs, i32 1 + store i32 43, ptr %elem1, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +;. +; CHECK: [[META0]] = !{i32 0, i32 1} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index ea3f08ede2c5d..3c285b67e2a6d 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=AKF_HSA %s ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=ATTRIBUTOR_HSA %s ; TODO: The test contains UB which is refined by the Attributor and should be removed. @@ -19,197 +18,130 @@ declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0 declare i64 @llvm.amdgcn.dispatch.id() #0 define void @use_workitem_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_x -; AKF_HSA-SAME: () #[[ATTR1:[0-9]+]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workitem.id.x() - store volatile i32 %val, ptr addrspace(1) undef + store volatile i32 %val, ptr addrspace(1) poison ret void } define void @use_workitem_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr 
addrspace(1) undef, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workitem.id.y() - store volatile i32 %val, ptr addrspace(1) undef + store volatile i32 %val, ptr addrspace(1) poison ret void } define void @use_workitem_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR3:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workitem.id.z() - store volatile i32 %val, ptr addrspace(1) undef + store volatile i32 %val, ptr addrspace(1) poison ret void } define void @use_workgroup_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR4:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, 
align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workgroup.id.x() - store volatile i32 %val, ptr addrspace(1) undef + store volatile i32 %val, ptr addrspace(1) poison ret void } define void @use_workgroup_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR5:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workgroup.id.y() - store volatile i32 %val, ptr addrspace(1) undef + store volatile i32 %val, ptr addrspace(1) poison ret void } define void @use_workgroup_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR6:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workgroup.id.z() - store volatile i32 %val, ptr addrspace(1) undef + store volatile i32 %val, ptr addrspace(1) poison ret void } define void 
@use_dispatch_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) undef, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR7:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) undef, align 8 +; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) poison, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void ; %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() - store volatile ptr addrspace(4) %dispatch.ptr, ptr addrspace(1) undef + store volatile ptr addrspace(4) %dispatch.ptr, ptr addrspace(1) poison ret void } define void @use_queue_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_queue_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[QUEUE_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[QUEUE_PTR]], ptr addrspace(1) undef, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_queue_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR8:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[QUEUE_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() -; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[QUEUE_PTR]], ptr addrspace(1) undef, align 8 +; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[QUEUE_PTR]], ptr addrspace(1) poison, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void ; %queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() - store volatile ptr addrspace(4) %queue.ptr, ptr addrspace(1) undef + store volatile ptr addrspace(4) %queue.ptr, ptr addrspace(1) poison ret void } define void @use_dispatch_id() #1 { 
-; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_id -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id() -; AKF_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) undef, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_id ; ATTRIBUTOR_HSA-SAME: () #[[ATTR9:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id() -; ATTRIBUTOR_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) undef, align 8 +; ATTRIBUTOR_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) poison, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i64 @llvm.amdgcn.dispatch.id() - store volatile i64 %val, ptr addrspace(1) undef + store volatile i64 %val, ptr addrspace(1) poison ret void } define void @use_workgroup_id_y_workgroup_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) undef, align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) undef, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y_workgroup_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR10:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() ; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) undef, align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) undef, align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) poison, align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) poison, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workgroup.id.y() %val1 = call i32 
@llvm.amdgcn.workgroup.id.z() - store volatile i32 %val0, ptr addrspace(1) undef - store volatile i32 %val1, ptr addrspace(1) undef + store volatile i32 %val0, ptr addrspace(1) poison + store volatile i32 %val1, ptr addrspace(1) poison ret void } define void @func_indirect_use_workitem_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workitem_id_x() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_x() @@ -220,11 +152,6 @@ define void @func_indirect_use_workitem_id_x() #1 { } define void @kernel_indirect_use_workitem_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workitem_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workitem_id_x() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workitem_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_x() @@ -235,11 +162,6 @@ define void @kernel_indirect_use_workitem_id_x() #1 { } define void @func_indirect_use_workitem_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workitem_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_y() @@ -250,11 +172,6 @@ define void @func_indirect_use_workitem_id_y() #1 { } define void @func_indirect_use_workitem_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workitem_id_z() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_z ; ATTRIBUTOR_HSA-SAME: () 
#[[ATTR3]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_z() @@ -265,11 +182,6 @@ define void @func_indirect_use_workitem_id_z() #1 { } define void @func_indirect_use_workgroup_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workgroup_id_x() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR4]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_x() @@ -280,11 +192,6 @@ define void @func_indirect_use_workgroup_id_x() #1 { } define void @kernel_indirect_use_workgroup_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workgroup_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workgroup_id_x() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workgroup_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR4]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_x() @@ -295,11 +202,6 @@ define void @kernel_indirect_use_workgroup_id_x() #1 { } define void @func_indirect_use_workgroup_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workgroup_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_y() @@ -310,11 +212,6 @@ define void @func_indirect_use_workgroup_id_y() #1 { } define void @func_indirect_use_workgroup_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workgroup_id_z() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR6]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_z() @@ -325,11 +222,6 @@ 
define void @func_indirect_use_workgroup_id_z() #1 { } define void @func_indirect_indirect_use_workgroup_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_indirect_use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @func_indirect_use_workgroup_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_indirect_use_workgroup_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_use_workgroup_id_y() @@ -340,11 +232,6 @@ define void @func_indirect_indirect_use_workgroup_id_y() #1 { } define void @indirect_x2_use_workgroup_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@indirect_x2_use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @func_indirect_indirect_use_workgroup_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_x2_use_workgroup_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_indirect_use_workgroup_id_y() @@ -355,11 +242,6 @@ define void @indirect_x2_use_workgroup_id_y() #1 { } define void @func_indirect_use_dispatch_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_dispatch_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_dispatch_ptr() @@ -370,11 +252,6 @@ define void @func_indirect_use_dispatch_ptr() #1 { } define void @func_indirect_use_queue_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_queue_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_queue_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_queue_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_queue_ptr() @@ -385,11 +262,6 @@ define void 
@func_indirect_use_queue_ptr() #1 { } define void @func_indirect_use_dispatch_id() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_id -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_dispatch_id() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_id ; ATTRIBUTOR_HSA-SAME: () #[[ATTR9]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_dispatch_id() @@ -400,11 +272,6 @@ define void @func_indirect_use_dispatch_id() #1 { } define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @func_indirect_use_workgroup_id_y_workgroup_id_z() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y_workgroup_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR11:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_use_workgroup_id_y_workgroup_id_z() @@ -415,32 +282,20 @@ define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 { } define void @recursive_use_workitem_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@recursive_use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4 -; AKF_HSA-NEXT: call void @recursive_use_workitem_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@recursive_use_workitem_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 ; ATTRIBUTOR_HSA-NEXT: call void @recursive_use_workitem_id_y() ; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workitem.id.y() - store volatile i32 %val, 
ptr addrspace(1) undef + store volatile i32 %val, ptr addrspace(1) poison call void @recursive_use_workitem_id_y() ret void } define void @call_recursive_use_workitem_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@call_recursive_use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @recursive_use_workitem_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@call_recursive_use_workitem_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: call void @recursive_use_workitem_id_y() @@ -451,12 +306,6 @@ define void @call_recursive_use_workitem_id_y() #1 { } define void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast -; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) @@ -470,12 +319,6 @@ define void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 { define void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) %ptr) #2 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9 -; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9 ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to 
ptr addrspace(4) @@ -488,13 +331,6 @@ define void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) %ptr) #2 { } define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) %ptr) #2 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9 -; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR2]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4 -; AKF_HSA-NEXT: call void @func_indirect_use_queue_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9 ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR14:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) @@ -509,11 +345,6 @@ define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) %pt } define void @indirect_use_group_to_flat_addrspacecast() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast(ptr addrspace(3) null) -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast(ptr addrspace(3) null) @@ -524,11 +355,6 @@ define void @indirect_use_group_to_flat_addrspacecast() #1 { } define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9 -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null) -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9 ; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] { ; ATTRIBUTOR_HSA-NEXT: call void 
@use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null) @@ -539,11 +365,6 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { } define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9 -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null) -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9 ; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null) @@ -554,28 +375,17 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { } define void @use_kernarg_segment_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_kernarg_segment_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[KERNARG_SEGMENT_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[KERNARG_SEGMENT_PTR]], ptr addrspace(1) undef, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_kernarg_segment_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] { ; ATTRIBUTOR_HSA-NEXT: [[KERNARG_SEGMENT_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[KERNARG_SEGMENT_PTR]], ptr addrspace(1) undef, align 8 +; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[KERNARG_SEGMENT_PTR]], ptr addrspace(1) poison, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void ; %kernarg.segment.ptr = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() - store volatile ptr addrspace(4) %kernarg.segment.ptr, ptr addrspace(1) undef + store volatile ptr addrspace(4) %kernarg.segment.ptr, ptr addrspace(1) poison ret void } define void @func_indirect_use_kernarg_segment_ptr() #1 { -; AKF_HSA-LABEL: 
define {{[^@]+}}@func_indirect_use_kernarg_segment_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_kernarg_segment_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_kernarg_segment_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_kernarg_segment_ptr() @@ -586,47 +396,30 @@ define void @func_indirect_use_kernarg_segment_ptr() #1 { } define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8 +; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) poison, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void ; %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - store volatile ptr addrspace(4) %implicitarg.ptr, ptr addrspace(1) undef + store volatile ptr addrspace(4) %implicitarg.ptr, ptr addrspace(1) poison ret void } define void @use_implicitarg_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: 
[[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8 +; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) poison, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void ; %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - store volatile ptr addrspace(4) %implicitarg.ptr, ptr addrspace(1) undef + store volatile ptr addrspace(4) %implicitarg.ptr, ptr addrspace(1) poison ret void } define void @func_indirect_use_implicitarg_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_implicitarg_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_implicitarg_ptr() @@ -640,10 +433,6 @@ declare void @external.func() #3 ; This function gets deleted. 
define internal void @defined.func() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@defined.func -; AKF_HSA-SAME: () #[[ATTR3:[0-9]+]] { -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func ; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void @@ -652,11 +441,6 @@ define internal void @defined.func() #3 { } define void @func_call_external() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_call_external -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: call void @external.func() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external ; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @external.func() @@ -667,11 +451,6 @@ define void @func_call_external() #3 { } define void @func_call_defined() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_call_defined -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: call void @defined.func() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined ; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: call void @defined.func() @@ -681,14 +460,9 @@ define void @func_call_defined() #3 { ret void } define void @func_call_asm() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_call_asm -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR3]] -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm ; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] { -; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR26:[0-9]+]] +; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR24:[0-9]+]] ; ATTRIBUTOR_HSA-NEXT: ret void ; call void asm sideeffect "", ""() #3 @@ -696,11 +470,6 @@ define void @func_call_asm() #3 { } define amdgpu_kernel void @kern_call_external() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_external -; AKF_HSA-SAME: () #[[ATTR4:[0-9]+]] { -; AKF_HSA-NEXT: call void @external.func() -; AKF_HSA-NEXT: ret void -; ; 
ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external ; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: call void @external.func() @@ -711,13 +480,8 @@ define amdgpu_kernel void @kern_call_external() #3 { } define amdgpu_kernel void @func_kern_defined() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_kern_defined -; AKF_HSA-SAME: () #[[ATTR4]] { -; AKF_HSA-NEXT: call void @defined.func() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined -; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: call void @defined.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -726,30 +490,18 @@ define amdgpu_kernel void @func_kern_defined() #3 { } define i32 @use_dispatch_ptr_ret_type() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr_ret_type -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) undef, align 8 -; AKF_HSA-NEXT: ret i32 0 -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr_ret_type ; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] { ; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) undef, align 8 +; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) poison, align 8 ; ATTRIBUTOR_HSA-NEXT: ret i32 0 ; %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() - store volatile ptr addrspace(4) %dispatch.ptr, ptr addrspace(1) undef + store volatile ptr addrspace(4) %dispatch.ptr, ptr addrspace(1) poison ret i32 0 } define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[F:%.*]] = call 
float @use_dispatch_ptr_ret_type() -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func ; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @use_dispatch_ptr_ret_type() @@ -762,12 +514,6 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 { } define float @func_indirect_call(ptr %fptr) #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_call -; AKF_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR3]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]() -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call ; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]() @@ -781,12 +527,6 @@ define float @func_indirect_call(ptr %fptr) #3 { declare float @extern() #3 define float @func_extern_call() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_extern_call -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float @extern() -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call ; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @extern() @@ -799,12 +539,6 @@ define float @func_extern_call() #3 { } define float @func_null_call(ptr %fptr) #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_null_call -; AKF_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR3]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float null() -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call ; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float null() @@ -820,12 +554,6 @@ declare float 
@llvm.amdgcn.rcp.f32(float) #0 ; Calls some other recognized intrinsic define float @func_other_intrinsic_call(float %arg) #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call -; AKF_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR3]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]]) -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call ; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]]) @@ -839,13 +567,8 @@ define float @func_other_intrinsic_call(float %arg) #3 { ; Hostcall needs to be enabled for sanitizers define amdgpu_kernel void @kern_sanitize_address() #4 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address -; AKF_HSA-SAME: () #[[ATTR5:[0-9]+]] { -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -855,13 +578,8 @@ define amdgpu_kernel void @kern_sanitize_address() #4 { ; Hostcall needs to be enabled for sanitizers define void @func_sanitize_address() #4 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_sanitize_address -; AKF_HSA-SAME: () #[[ATTR5]] { -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -871,13 +589,8 @@ define void @func_sanitize_address() #4 { ; Hostcall needs to be enabled for sanitizers define void 
@func_indirect_sanitize_address() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: call void @func_sanitize_address() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -887,13 +600,8 @@ define void @func_indirect_sanitize_address() #3 { ; Hostcall needs to be enabled for sanitizers define amdgpu_kernel void @kern_indirect_sanitize_address() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address -; AKF_HSA-SAME: () #[[ATTR4]] { -; AKF_HSA-NEXT: call void @func_sanitize_address() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -906,11 +614,6 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 { declare void @extern_func_sanitize_address() #5 define amdgpu_kernel void @kern_decl_sanitize_address() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address -; AKF_HSA-SAME: () #[[ATTR4]] { -; AKF_HSA-NEXT: call void @extern_func_sanitize_address() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address ; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: call void @extern_func_sanitize_address() @@ -923,25 +626,16 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 { declare void @enqueue_block_decl() #6 define internal void @enqueue_block_def() #6 { -; AKF_HSA-LABEL: define {{[^@]+}}@enqueue_block_def -; AKF_HSA-SAME: () #[[ATTR7:[0-9]+]] { -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def -; 
ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void } define amdgpu_kernel void @kern_call_enqueued_block_decl() { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl -; AKF_HSA-SAME: () #[[ATTR8:[0-9]+]] { -; AKF_HSA-NEXT: call void @enqueue_block_decl() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl -; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_decl() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -950,13 +644,8 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() { } define amdgpu_kernel void @kern_call_enqueued_block_def() { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def -; AKF_HSA-SAME: () #[[ATTR8]] { -; AKF_HSA-NEXT: call void @enqueue_block_def() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def -; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_def() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -965,22 +654,16 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() { } define void @unused_enqueue_block() { -; AKF_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block() { -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block -; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR23]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void } define internal void @known_func() { -; AKF_HSA-LABEL: define {{[^@]+}}@known_func() { -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func -; ATTRIBUTOR_HSA-SAME: () #[[ATTR25]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR23]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void @@ -988,14 +671,9 @@ define internal void @known_func() { ; Should never happen define 
amdgpu_kernel void @kern_callsite_enqueue_block() { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block -; AKF_HSA-SAME: () #[[ATTR8]] { -; AKF_HSA-NEXT: call void @known_func() #[[ATTR7]] -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block -; ATTRIBUTOR_HSA-SAME: () #[[ATTR24]] { -; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR27:[0-9]+]] +; ATTRIBUTOR_HSA-SAME: () #[[ATTR23]] { +; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR25:[0-9]+]] ; ATTRIBUTOR_HSA-NEXT: ret void ; call void @known_func() #6 @@ -1011,18 +689,6 @@ attributes #5 = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" } attributes #6 = { "enqueued-block" } -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} -;. -; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; AKF_HSA: attributes #[[ATTR1]] = { nounwind "target-cpu"="fiji" } -; AKF_HSA: attributes #[[ATTR2]] = { nounwind "target-cpu"="gfx900" } -; AKF_HSA: attributes #[[ATTR3]] = { nounwind } -; AKF_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-calls" } -; AKF_HSA: attributes #[[ATTR5]] = { nounwind sanitize_address } -; AKF_HSA: attributes #[[ATTR6:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" } -; AKF_HSA: attributes #[[ATTR7]] = { "enqueued-block" } -; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" } ;. 
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } @@ -1041,19 +707,13 @@ attributes #6 = { "enqueued-block" } ; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR20:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR21:[0-9]+]] = { "enqueued-block" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" 
"amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "enqueued-block" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { nounwind } -; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "enqueued-block" } -;. -; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} -;. 
-; ATTRIBUTOR_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} +; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR19:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR20:[0-9]+]] = { "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { 
"uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { nounwind } +; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "enqueued-block" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 6896ac8d2e5db..6f4b16ec11130 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -1,8 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefix=HSA %s declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 @@ -33,17 +30,11 @@ define amdgpu_kernel void @use_tgid_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define 
{{[^@]+}}@use_tgid_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_y -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_tgid_y +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { +; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workgroup.id.y() store i32 %val, ptr addrspace(1) %ptr @@ -51,21 +42,13 @@ define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: 
ret void +; HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { +; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workgroup.id.y() store volatile i32 %val0, ptr addrspace(1) %ptr @@ -75,21 +58,13 @@ define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_tgid_x_y +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { +; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; 
HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -99,17 +74,11 @@ define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_z -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_tgid_z +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { +; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workgroup.id.z() store i32 %val, ptr addrspace(1) %ptr @@ -117,21 +86,13 @@ define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_z -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 
@llvm.amdgcn.workgroup.id.x() -; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_tgid_x_z +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] { +; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() @@ -141,21 +102,13 @@ define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_y_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_y_z -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_tgid_y_z +; HSA-SAME: (ptr addrspace(1) 
[[PTR:%.*]]) #[[ATTR4:[0-9]+]] { +; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workgroup.id.y() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() @@ -165,25 +118,15 @@ define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; ATTRIBUTOR_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] { +; HSA-NEXT: [[VAL0:%.*]] = call i32 
@llvm.amdgcn.workgroup.id.x() +; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -207,17 +150,11 @@ define amdgpu_kernel void @use_tidig_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_y -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_tidig_y +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { +; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workitem.id.y() store i32 %val, ptr addrspace(1) %ptr @@ -225,17 +162,11 @@ define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) 
[[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_z -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_tidig_z +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] { +; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workitem.id.z() store i32 %val, ptr addrspace(1) %ptr @@ -259,21 +190,13 @@ define amdgpu_kernel void @use_tidig_x_tgid_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] { +; HSA-NEXT: [[VAL0:%.*]] = call i32 
@llvm.amdgcn.workitem.id.y() +; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workitem.id.y() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -283,25 +206,15 @@ define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; ATTRIBUTOR_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] { +; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; HSA-NEXT: [[VAL1:%.*]] = call i32 
@llvm.amdgcn.workitem.id.y() +; HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() @@ -313,37 +226,21 @@ define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_all_workitems -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; AKF_HSA-NEXT: [[VAL3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: [[VAL4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: [[VAL5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_all_workitems -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR9:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; ATTRIBUTOR_HSA-NEXT: [[VAL2:%.*]] = call i32 
@llvm.amdgcn.workitem.id.z() -; ATTRIBUTOR_HSA-NEXT: [[VAL3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; ATTRIBUTOR_HSA-NEXT: [[VAL4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; ATTRIBUTOR_HSA-NEXT: [[VAL5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_all_workitems +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR9:[0-9]+]] { +; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; HSA-NEXT: [[VAL3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; HSA-NEXT: [[VAL4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; HSA-NEXT: [[VAL5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() @@ -361,19 +258,12 @@ define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 { } 
define amdgpu_kernel void @use_dispatch_ptr(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4 -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR10:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR10:[0-9]+]] { +; HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4 +; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %val = load i32, ptr addrspace(4) %dispatch.ptr @@ -382,19 +272,12 @@ define amdgpu_kernel void @use_dispatch_ptr(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_queue_ptr(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_queue_ptr -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() -; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4 -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_queue_ptr -; ATTRIBUTOR_HSA-SAME: 
(ptr addrspace(1) [[PTR:%.*]]) #[[ATTR11:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() -; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_queue_ptr +; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR11:[0-9]+]] { +; HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() +; HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4 +; HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 +; HSA-NEXT: ret void ; %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() %val = load i32, ptr addrspace(4) %dispatch.ptr @@ -417,17 +300,11 @@ define amdgpu_kernel void @use_kernarg_segment_ptr(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast -; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr -; AKF_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr -; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast +; HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] { +; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr +; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(3) %ptr to ptr store volatile i32 0, 
ptr %stof @@ -435,17 +312,11 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr } define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast -; AKF_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; AKF_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast +; HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { +; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof @@ -526,59 +397,39 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 { } define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_is_shared -; AKF_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) -; AKF_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32 -; AKF_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) undef, align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared -; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] { -; ATTRIBUTOR_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) -; ATTRIBUTOR_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32 -; ATTRIBUTOR_HSA-NEXT: store 
i32 [[EXT]], ptr addrspace(1) undef, align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_is_shared +; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] { +; HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) +; HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32 +; HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) poison, align 4 +; HSA-NEXT: ret void ; %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %ptr) %ext = zext i1 %is.shared to i32 - store i32 %ext, ptr addrspace(1) undef + store i32 %ext, ptr addrspace(1) poison ret void } define amdgpu_kernel void @use_is_private(ptr %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_is_private -; AKF_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) -; AKF_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32 -; AKF_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) undef, align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private -; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] { -; ATTRIBUTOR_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) -; ATTRIBUTOR_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32 -; ATTRIBUTOR_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) undef, align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_is_private +; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] { +; HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32 +; HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) poison, align 4 +; HSA-NEXT: ret void ; %is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr) %ext = zext i1 %is.private to i32 - store i32 %ext, ptr addrspace(1) undef + store i32 %ext, ptr addrspace(1) poison ret void } define amdgpu_kernel void @use_alloca() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca -; AKF_HSA-SAME: () #[[ATTR2:[0-9]+]] { -; AKF_HSA-NEXT: 
[[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) -; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca -; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { -; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) -; ATTRIBUTOR_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_alloca +; HSA-SAME: () #[[ATTR1]] { +; HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) +; HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 +; HSA-NEXT: ret void ; %alloca = alloca i32, addrspace(5) store i32 0, ptr addrspace(5) %alloca @@ -586,23 +437,14 @@ define amdgpu_kernel void @use_alloca() #1 { } define amdgpu_kernel void @use_alloca_non_entry_block() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block -; AKF_HSA-SAME: () #[[ATTR2]] { -; AKF_HSA-NEXT: entry: -; AKF_HSA-NEXT: br label [[BB:%.*]] -; AKF_HSA: bb: -; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) -; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block -; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { -; ATTRIBUTOR_HSA-NEXT: entry: -; ATTRIBUTOR_HSA-NEXT: br label [[BB:%.*]] -; ATTRIBUTOR_HSA: bb: -; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) -; ATTRIBUTOR_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block +; HSA-SAME: () #[[ATTR1]] { +; HSA-NEXT: entry: +; HSA-NEXT: br label [[BB:%.*]] +; HSA: bb: +; HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) +; HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 +; HSA-NEXT: ret void ; entry: br label %bb @@ -614,17 +456,11 @@ bb: } define void @use_alloca_func() #1 { -; AKF_HSA-LABEL: define 
{{[^@]+}}@use_alloca_func -; AKF_HSA-SAME: () #[[ATTR2]] { -; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) -; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 -; AKF_HSA-NEXT: ret void -; -; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func -; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { -; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) -; ATTRIBUTOR_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 -; ATTRIBUTOR_HSA-NEXT: ret void +; HSA-LABEL: define {{[^@]+}}@use_alloca_func +; HSA-SAME: () #[[ATTR1]] { +; HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) +; HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 +; HSA-NEXT: ret void ; %alloca = alloca i32, addrspace(5) store i32 0, ptr addrspace(5) %alloca @@ -634,30 +470,21 @@ define void @use_alloca_func() #1 { attributes #0 = { nounwind readnone speculatable } attributes #1 = { nounwind } -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} -;. -; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; AKF_HSA: attributes #[[ATTR1]] = { nounwind } -; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" } -;. 
-; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. -; ATTRIBUTOR_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} +; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" 
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll index 89fe46d975309..7c3f8697c1d94 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=CHECK,AKF_CHECK %s -; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=CHECK,ATTRIBUTOR_CHECK %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor < %s | FileCheck %s declare i32 @llvm.r600.read.tgid.x() #0 declare i32 @llvm.r600.read.tgid.y() #0 @@ -27,17 +26,11 @@ define amdgpu_kernel void @use_tgid_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; -; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_y -; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { -; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y() -; ATTRIBUTOR_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@use_tgid_y +; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y() +; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: ret void ; %val = call i32 @llvm.r600.read.tgid.y() store i32 %val, ptr addrspace(1) %ptr @@ -45,21 +38,13 @@ define amdgpu_kernel 
void @use_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; -; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y -; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y +; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() +; CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() +; CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tgid.y() store volatile i32 %val0, ptr addrspace(1) %ptr @@ -69,21 +54,13 @@ define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: 
store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; -; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y -; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y +; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() +; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() +; CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.y() @@ -93,17 +70,11 @@ define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; -; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_z -; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { -; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z() -; ATTRIBUTOR_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@use_tgid_z +; CHECK-SAME: (ptr addrspace(1) 
[[PTR:%.*]]) #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z() +; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: ret void ; %val = call i32 @llvm.r600.read.tgid.z() store i32 %val, ptr addrspace(1) %ptr @@ -111,21 +82,13 @@ define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; -; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z -; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] { -; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z +; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() +; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() +; CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.z() @@ -135,21 +98,13 @@ define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_y_z(ptr 
addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; -; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z -; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] { -; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() -; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z +; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() +; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() +; CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tgid.y() %val1 = call i32 @llvm.r600.read.tgid.z() @@ -159,25 +114,15 @@ define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], 
ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; -; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z -; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] { -; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; ATTRIBUTOR_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tgid.z() -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z +; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() +; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() +; CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tgid.z() +; CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.y() @@ -201,17 +146,11 @@ define amdgpu_kernel void @use_tidig_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y() -; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; -; ATTRIBUTOR_CHECK-LABEL: 
define {{[^@]+}}@use_tidig_y -; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { -; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y() -; ATTRIBUTOR_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@use_tidig_y +; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { +; CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y() +; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: ret void ; %val = call i32 @llvm.r600.read.tidig.y() store i32 %val, ptr addrspace(1) %ptr @@ -219,17 +158,11 @@ define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z() -; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; -; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_z -; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] { -; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z() -; ATTRIBUTOR_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@use_tidig_z +; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] { +; CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z() +; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: ret void ; %val = call i32 @llvm.r600.read.tidig.z() store i32 %val, ptr addrspace(1) %ptr @@ -253,21 +186,13 @@ define amdgpu_kernel void @use_tidig_x_tgid_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y -; 
AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; -; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y -; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] { -; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y() -; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y +; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y() +; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() +; CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tidig.y() %val1 = call i32 @llvm.r600.read.tgid.y() @@ -277,25 +202,15 @@ define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() -; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; 
AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; -; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z -; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] { -; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() -; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() -; ATTRIBUTOR_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z +; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() +; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() +; CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() +; CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call i32 @llvm.r600.read.tidig.y() @@ -307,37 +222,21 @@ define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_all_workitems -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() -; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 
@llvm.r600.read.tidig.z() -; AKF_CHECK-NEXT: [[VAL3:%.*]] = call i32 @llvm.r600.read.tgid.x() -; AKF_CHECK-NEXT: [[VAL4:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: [[VAL5:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; -; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_all_workitems -; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR9:[0-9]+]] { -; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() -; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() -; ATTRIBUTOR_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() -; ATTRIBUTOR_CHECK-NEXT: [[VAL3:%.*]] = call i32 @llvm.r600.read.tgid.x() -; ATTRIBUTOR_CHECK-NEXT: [[VAL4:%.*]] = call i32 @llvm.r600.read.tgid.y() -; ATTRIBUTOR_CHECK-NEXT: [[VAL5:%.*]] = call i32 @llvm.r600.read.tgid.z() -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4 -; ATTRIBUTOR_CHECK-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@use_all_workitems +; CHECK-SAME: (ptr addrspace(1) 
[[PTR:%.*]]) #[[ATTR9:[0-9]+]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() +; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() +; CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() +; CHECK-NEXT: [[VAL3:%.*]] = call i32 @llvm.r600.read.tgid.x() +; CHECK-NEXT: [[VAL4:%.*]] = call i32 @llvm.r600.read.tgid.y() +; CHECK-NEXT: [[VAL5:%.*]] = call i32 @llvm.r600.read.tgid.z() +; CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call i32 @llvm.r600.read.tidig.y() @@ -394,17 +293,14 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind } ;. -; AKF_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; AKF_CHECK: attributes #[[ATTR1]] = { nounwind } -;. 
-; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { nounwind 
"amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" 
} +; CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index a9e092fa39fbe..cd405fabf002d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -220,7 +220,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -240,6 +239,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -257,7 +257,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 @@ -266,7 +265,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -499,7 +497,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; 
GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -519,6 +516,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -537,7 +535,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -883,7 +880,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -893,7 +889,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -908,7 +904,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr 
addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -940,6 +935,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 @@ -1292,7 +1288,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_clause 0x1 ; GFX12W64-NEXT: s_load_b32 s3, s[4:5], 0x44 ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, s3 @@ -1303,7 +1298,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1318,7 +1313,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -1352,6 +1346,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr 
addrspace(1) %out, p ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 @@ -1686,7 +1681,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1707,6 +1701,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1724,7 +1719,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 @@ -1733,7 +1727,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1970,7 +1963,6 @@ define amdgpu_kernel 
void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1992,6 +1984,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2009,7 +2002,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -2025,6 +2017,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] @@ -2357,7 +2350,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -2367,7 +2359,7 @@ 
define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -2382,7 +2374,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -2415,6 +2406,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 9577230c6c52e..adc91d56c3c27 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -262,7 +262,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -586,7 +585,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr 
addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -629,7 +627,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB1_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -1018,7 +1015,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -1047,7 +1043,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 @@ -1509,6 +1504,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1264_DPP-NEXT: 
s_mov_b32 s2, s6 @@ -1543,7 +1539,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -1843,7 +1838,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1887,7 +1881,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB3_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 @@ -2227,7 +2220,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2274,7 +2266,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB4_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: 
s_wait_kmcnt 0x0 @@ -2719,7 +2710,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s6 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s7 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -2751,7 +2741,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 @@ -3378,20 +3367,22 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf 
bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -3399,6 +3390,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 @@ -3407,9 +3399,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, s2 ; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc 
bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -3461,9 +3455,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc ; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], null @@ -3486,20 +3482,22 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; 
GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -3523,7 +3521,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v6, s7, 16 ; GFX1232_DPP-NEXT: v_writelane_b32 v7, s8, 16 -; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo @@ -3547,9 +3544,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 ; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo ; GFX1232_DPP-NEXT: 
s_mov_b32 s3, 0x31016000 ; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null @@ -3809,7 +3807,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -4139,7 +4136,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -4183,7 +4179,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB7_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -4573,7 +4568,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 @@ -4602,7 +4596,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 @@ -5064,6 +5057,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 @@ -5098,7 +5092,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -5412,7 +5405,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5459,7 +5451,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB9_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 @@ -5813,7 +5804,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 
0 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5864,7 +5854,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB10_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -6313,7 +6302,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s6 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s7 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 @@ -6345,7 +6333,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 @@ -6972,20 +6959,22 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -6993,6 +6982,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; 
GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 @@ -7001,9 +6991,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, s2 ; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7055,9 +7047,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1264_DPP-NEXT: s_wait_alu 0xfffd ; GFX1264_DPP-NEXT: v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc ; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], null @@ -7080,20 +7074,22 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo ; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf @@ -7117,7 +7113,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v6, s7, 16 ; GFX1232_DPP-NEXT: v_writelane_b32 v7, s8, 16 -; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo @@ -7141,9 +7136,10 @@ define amdgpu_kernel void 
@sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 ; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1232_DPP-NEXT: s_wait_alu 0xfffd ; GFX1232_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo ; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 905a515d7c125..8c6224cc86284 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -219,7 +219,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -239,6 +238,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -256,7 +256,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: 
s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 @@ -265,7 +264,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -498,7 +496,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -518,6 +515,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -536,7 +534,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -882,7 +879,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 
s[8:11], s[4:5], 0x34 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -892,7 +888,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -907,7 +903,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -939,6 +934,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 @@ -1273,7 +1269,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1294,6 +1289,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: 
v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1311,7 +1307,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 @@ -1320,7 +1315,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB4_2: -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1557,7 +1551,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1579,6 +1572,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1596,7 +1590,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; 
GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -1612,6 +1605,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] @@ -1944,7 +1938,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN @@ -1954,7 +1947,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1969,7 +1962,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -2002,6 +1994,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: 
s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 9801e6ede5eeb..63b46eba41225 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -226,7 +226,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -247,6 +246,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -264,7 +264,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 @@ -273,7 +272,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; 
GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -513,7 +511,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -534,6 +531,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -552,7 +550,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -905,7 +902,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN @@ -915,7 +911,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; 
GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -930,7 +926,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -962,6 +957,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 @@ -1431,7 +1427,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1453,6 +1448,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1470,7 +1466,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: 
s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 @@ -1479,7 +1474,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1723,7 +1717,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 @@ -1746,6 +1739,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1763,7 +1757,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -1779,6 +1772,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: 
s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] @@ -2118,7 +2112,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN @@ -2128,7 +2121,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_wait_alu 0xfffe +; GFX12W64-NEXT: s_wait_alu 0xf1ff ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -2143,7 +2136,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 @@ -2176,6 +2168,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: s_wait_alu 0xf1ff ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll 
b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index fc13b86566f76..616867481d177 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -24,10 +24,10 @@ entry: attributes #1 = {"amdgpu-flat-work-group-size"="64,128"} ; CHECK-LABEL: {{^}}min_128_max_128: -; CHECK: SGPRBlocks: 0 -; CHECK: VGPRBlocks: 0 -; CHECK: NumSGPRsForWavesPerEU: 1 -; CHECK: NumVGPRsForWavesPerEU: 1 +; CHECK: SGPRBlocks: 8 +; CHECK: VGPRBlocks: 7 +; CHECK: NumSGPRsForWavesPerEU: 65 +; CHECK: NumVGPRsForWavesPerEU: 29 define amdgpu_kernel void @min_128_max_128() #2 { entry: ret void @@ -35,9 +35,9 @@ entry: attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} ; CHECK-LABEL: {{^}}min_1024_max_1024 -; CHECK: SGPRBlocks: 0 +; CHECK: SGPRBlocks: 8 ; CHECK: VGPRBlocks: 10 -; CHECK: NumSGPRsForWavesPerEU: 2{{$}} +; CHECK: NumSGPRsForWavesPerEU: 65 ; CHECK: NumVGPRsForWavesPerEU: 43 @var = addrspace(1) global float 0.0 define amdgpu_kernel void @min_1024_max_1024() #3 { diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll index 678c3a0158ec1..6ec6dce460c01 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll @@ -27,7 +27,7 @@ define internal void @callee_1_2_3() { define amdgpu_kernel void @kernel_1_2_3() #0 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_1_2_3( -; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: call void @callee_1_2_3() ; CHECK-NEXT: call void @extern_callee() ; CHECK-NEXT: call void @dummy() @@ -44,7 +44,7 @@ attributes #0 = {"amdgpu-max-num-workgroups"="1,2,3"} ; -> 100,10,99 define internal void @callee_merge_100_8_32__16_10_99() { ; CHECK-LABEL: define internal void @callee_merge_100_8_32__16_10_99( -; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { +; 
CHECK-SAME: ) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: call void @dummy() ; CHECK-NEXT: ret void ; @@ -54,7 +54,7 @@ define internal void @callee_merge_100_8_32__16_10_99() { define amdgpu_kernel void @kernel_100_8_32() #1 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_100_8_32( -; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { ; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99() ; CHECK-NEXT: ret void ; @@ -64,7 +64,7 @@ define amdgpu_kernel void @kernel_100_8_32() #1 { define amdgpu_cs void @amdgpu_cs_100_8_32() #1 { ; CHECK-LABEL: define amdgpu_cs void @amdgpu_cs_100_8_32( -; CHECK-SAME: ) #[[ATTR4]] { +; CHECK-SAME: ) #[[ATTR3]] { ; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99() ; CHECK-NEXT: ret void ; @@ -76,7 +76,7 @@ attributes #1 = {"amdgpu-max-num-workgroups"="100,8,32"} define amdgpu_kernel void @kernel_16_10_99() #2 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_16_10_99( -; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99() ; CHECK-NEXT: call void @dummy() ; CHECK-NEXT: ret void @@ -110,7 +110,7 @@ define internal void @callee_x_worst_case() { define amdgpu_kernel void @kernel_x_maximum() #3 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_x_maximum( -; CHECK-SAME: ) #[[ATTR6:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: call void @merge_to_worst_case() ; CHECK-NEXT: call void @callee_x_worst_case() ; CHECK-NEXT: call void @dummy() @@ -126,7 +126,7 @@ attributes #3 = {"amdgpu-max-num-workgroups"="4294967295,1,1"} define amdgpu_kernel void @kernel_y_maximum() #4 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_y_maximum( -; CHECK-SAME: ) #[[ATTR7:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR6:[0-9]+]] { ; CHECK-NEXT: call void @merge_to_worst_case() ; CHECK-NEXT: call void @dummy() ; CHECK-NEXT: ret void @@ -140,7 +140,7 @@ attributes #4 = {"amdgpu-max-num-workgroups"="1,4294967295,1"} define amdgpu_kernel void 
@kernel_z_maximum() #5 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_z_maximum( -; CHECK-SAME: ) #[[ATTR8:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR7:[0-9]+]] { ; CHECK-NEXT: call void @merge_to_worst_case() ; CHECK-NEXT: call void @dummy() ; CHECK-NEXT: ret void @@ -155,7 +155,7 @@ attributes #5 = {"amdgpu-max-num-workgroups"="1,1,4294967295"} ; Make sure the attribute isn't lost from the callee. define internal void @annotated_callee_from_unannotated_kernel() #6 { ; CHECK-LABEL: define internal void @annotated_callee_from_unannotated_kernel( -; CHECK-SAME: ) #[[ATTR9:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR8:[0-9]+]] { ; CHECK-NEXT: call void @dummy() ; CHECK-NEXT: ret void ; @@ -167,7 +167,7 @@ attributes #6 = {"amdgpu-max-num-workgroups"="42,99,123"} define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee() { ; CHECK-LABEL: define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee( -; CHECK-SAME: ) #[[ATTR10:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: call void @annotated_callee_from_unannotated_kernel() ; CHECK-NEXT: ret void ; @@ -178,7 +178,7 @@ define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee() { define internal void @annotated_callee_merge_caller() #7 { ; CHECK-LABEL: define internal void @annotated_callee_merge_caller( -; CHECK-SAME: ) #[[ATTR11:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR9:[0-9]+]] { ; CHECK-NEXT: call void @dummy() ; CHECK-NEXT: ret void ; @@ -190,7 +190,7 @@ attributes #7 = {"amdgpu-max-num-workgroups"="512,256,1024"} define amdgpu_kernel void @call_annotated_callee_merge_caller() #8 { ; CHECK-LABEL: define amdgpu_kernel void @call_annotated_callee_merge_caller( -; CHECK-SAME: ) #[[ATTR12:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: call void @annotated_callee_merge_caller() ; CHECK-NEXT: ret void ; @@ -212,7 +212,7 @@ define internal void @called_by_explicit_worst_case() { define amdgpu_kernel void @kernel_explicit_worst_case() #9 { ; CHECK-LABEL: define amdgpu_kernel void 
@kernel_explicit_worst_case( -; CHECK-SAME: ) #[[ATTR13:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR11:[0-9]+]] { ; CHECK-NEXT: call void @called_by_explicit_worst_case() ; CHECK-NEXT: ret void ; @@ -225,16 +225,14 @@ attributes #9 = {"amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295"} ;. ; CHECK: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="1,2,3" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="1,2,3" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="100,10,99" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-max-num-workgroups"="100,8,32" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-max-num-workgroups"="16,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-max-num-workgroups"="4294967295,1,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-max-num-workgroups"="1,4294967295,1" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-max-num-workgroups"="1,1,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR9]] = { "amdgpu-max-num-workgroups"="42,99,123" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR10]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR11]] = { "amdgpu-max-num-workgroups"="256,128,1024" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR12]] = { "amdgpu-max-num-workgroups"="256,128,2048" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR13]] = { "amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295" "amdgpu-waves-per-eu"="4,10" 
"uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="100,10,99" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="100,8,32" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-max-num-workgroups"="16,10,99" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-max-num-workgroups"="4294967295,1,1" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-max-num-workgroups"="1,4294967295,1" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-max-num-workgroups"="1,1,4294967295" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-max-num-workgroups"="42,99,123" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-max-num-workgroups"="256,128,1024" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR10]] = { "amdgpu-max-num-workgroups"="256,128,2048" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR11]] = { "amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 14519f5a5e77c..7d2248c4f4501 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -200,3 +200,28 @@ entry: ret void } attributes #10 = {"amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="2,2"} + +; Minimum 2 waves, maximum limited by LDS usage. 
+; CHECK-LABEL: {{^}}empty_at_least_2_lds_limited: +; CHECK: SGPRBlocks: 12 +; CHECK: VGPRBlocks: 12 +; CHECK: NumSGPRsForWavesPerEU: 102 +; CHECK: NumVGPRsForWavesPerEU: 49 +define amdgpu_kernel void @empty_at_least_2_lds_limited() #11 { +entry: + ret void +} +attributes #11 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" "amdgpu-lds-size"="16384"} + +; Minimum 2 waves, maximum limited by LDS usage. Requested maximum within spec +; but above achievable occupancy has no effect. +; CHECK-LABEL: {{^}}empty_at_least_2_lds_limited_max_above_achievable: +; CHECK: SGPRBlocks: 12 +; CHECK: VGPRBlocks: 12 +; CHECK: NumSGPRsForWavesPerEU: 102 +; CHECK: NumVGPRsForWavesPerEU: 49 +define amdgpu_kernel void @empty_at_least_2_lds_limited_max_above_achievable() #12 { +entry: + ret void +} +attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"} diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll new file mode 100644 index 0000000000000..534b8d75bc5a4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s + +; +; None of these functions should have the attribute amdgpu-no-flat-scratch-init. In these tests +; we manually set the attribute for the functions. The purpose is to test how the amdgpu-attributor pass +; handles this situation. 
+; +;; tests of addrspacecast + +define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { +; GFX9-LABEL: define void @with_private_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @with_private_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 { +; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { +; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast( 
+; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 { +; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +;; tests of addrspacecast in a constant + +define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) #0 { +; GFX9-LABEL: define amdgpu_kernel void @private_constant_expression_use( +; GFX9-SAME: ptr addrspace(1) nocapture [[OUT:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) [[OUT]], align 8 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @private_constant_expression_use( +; GFX10-SAME: ptr addrspace(1) nocapture [[OUT:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) [[OUT]], align 8 +; GFX10-NEXT: ret void +; + store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8 + ret void +} + +;; tests of intrinsics + +define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr 
addrspace(3) %ptr) #0 { +; GFX9-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel( +; GFX9-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) [[PTR]]) +; GFX9-NEXT: store volatile i32 7, ptr [[TMP1]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel( +; GFX10-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) [[PTR]]) +; GFX10-NEXT: store volatile i32 7, ptr [[TMP1]], align 4 +; GFX10-NEXT: ret void +; + %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) + store volatile i32 7, ptr %1, align 4 + ret void +} + +define void @calls_intrin_ascast(ptr addrspace(3) %ptr) #0 { +; GFX9-LABEL: define void @calls_intrin_ascast( +; GFX9-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) [[PTR]]) +; GFX9-NEXT: store volatile i32 7, ptr [[TMP1]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @calls_intrin_ascast( +; GFX10-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) [[PTR]]) +; GFX10-NEXT: store volatile i32 7, ptr [[TMP1]], align 4 +; GFX10-NEXT: ret void +; + %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) + store volatile i32 7, ptr %1, align 4 + ret void +} + +define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 { +; GFX9-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel( +; GFX9-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: call void @calls_intrin_ascast(ptr addrspace(3) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel( 
+; GFX10-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: call void @calls_intrin_ascast(ptr addrspace(3) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @calls_intrin_ascast(ptr addrspace(3) %ptr) + ret void +} + +attributes #0 = { "amdgpu-no-flat-scratch-init" } +;. +; GFX9: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx900" } +;. +; GFX10: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" } +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll index f9f7c4193b07b..dfe966882a9ad 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll @@ -825,21 +825,6 @@ define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) ret void } -define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) { -; GFX9-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel( -; GFX9-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] { -; GFX9-NEXT: call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) [[PTR]]) -; GFX9-NEXT: ret void -; -; GFX10-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel( -; GFX10-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] { -; GFX10-NEXT: call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) [[PTR]]) -; GFX10-NEXT: ret void -; - call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) - ret void -} - define amdgpu_kernel void @with_inline_asm() { ; GFX9-LABEL: define amdgpu_kernel void @with_inline_asm( ; GFX9-SAME: ) #[[ATTR3]] { diff --git a/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll b/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll index 8481cea4d7c35..a9efcdcb0af6d 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll @@ -51,7 +51,7 @@ bb5: ; preds = %bb5, %bb3 define amdgpu_kernel void @entry() { ; CHECK-LABEL: define {{[^@]+}}@entry -; CHECK-SAME: () #[[ATTR1:[0-9]+]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [[TMP0:%.*]], align 8, addrspace(5) ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr ; CHECK-NEXT: [[ARST:%.*]] = call double @baz(ptr [[CAST]]) @@ -64,5 +64,4 @@ define amdgpu_kernel void @entry() { } ;. 
; CHECK: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir index effb47808a5d9..1e7bd75fda085 100644 --- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir +++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir @@ -459,6 +459,38 @@ body: | ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GCN-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GCN-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr40, 4352 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 2b2478bb4f859..c2125e3435bf6 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -25056,8 +25056,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GCN-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GCN-NEXT: v_log_f32_e32 v0, v0 ; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 @@ -25083,8 +25082,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x800000 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_log_f32_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x3f317217 @@ -25108,8 
+25106,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x800000 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x7f800000 @@ -25142,8 +25139,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x800000 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x3f317217 @@ -25172,8 +25168,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -25198,30 +25193,28 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; 
GFX11-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo ; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.log.bf16(bfloat %a) @@ -25237,8 +25230,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GCN-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GCN-NEXT: v_log_f32_e32 v0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -25253,8 +25245,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX7-NEXT: v_and_b32_e32 
v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x800000 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_log_f32_e32 v0, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -25269,8 +25260,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x800000 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -25291,8 +25281,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x800000 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -25312,9 +25301,8 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -25332,21 +25320,20 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.log2.bf16(bfloat %a) @@ -25363,8 +25350,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GCN-NEXT: v_mov_b32_e32 v1, 0x411a209b ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GCN-NEXT: v_log_f32_e32 v0, v0 ; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 @@ -25390,8 +25376,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x800000 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_log_f32_e32 v0, v0 
; GFX7-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -25415,8 +25400,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x800000 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x7f800000 @@ -25449,8 +25433,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x800000 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -25479,8 +25462,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -25505,30 +25487,28 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: 
v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo ; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.log10.bf16(bfloat %a) @@ -37515,14 +37495,6 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-LABEL: v_vselect_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v31, s34, 0 -; GCN-NEXT: 
v_writelane_b32 v31, s35, 1 -; GCN-NEXT: v_writelane_b32 v31, s30, 2 -; GCN-NEXT: v_writelane_b32 v31, s31, 3 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_and_b32_e32 v0, 1, v1 @@ -37560,21 +37532,21 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-NEXT: v_and_b32_e32 v8, 1, v14 ; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 -; GCN-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v8 +; GCN-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v8 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_and_b32_e32 v9, 1, v15 -; GCN-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v9 +; GCN-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v9 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[34:35] +; GCN-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[42:43] ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[30:31] +; GCN-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[40:41] ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(1) @@ -37650,14 +37622,6 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_readlane_b32 s30, v31, 2 -; GCN-NEXT: v_readlane_b32 s31, v31, 3 -; GCN-NEXT: v_readlane_b32 s35, v31, 1 -; GCN-NEXT: v_readlane_b32 s34, v31, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; 
GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_vselect_v16bf16: @@ -37798,13 +37762,6 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-LABEL: v_vselect_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v31, s34, 0 -; GFX8-NEXT: v_writelane_b32 v31, s35, 1 -; GFX8-NEXT: v_writelane_b32 v31, s30, 2 -; GFX8-NEXT: v_writelane_b32 v31, s31, 3 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 @@ -37834,9 +37791,9 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29] @@ -37862,9 +37819,9 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[40:41] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, s[42:43] ; 
GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17] @@ -37882,19 +37839,11 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; GFX8-NEXT: v_readlane_b32 s30, v31, 2 ; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_readlane_b32 s31, v31, 3 -; GFX8-NEXT: v_readlane_b32 s35, v31, 1 -; GFX8-NEXT: v_readlane_b32 s34, v31, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v16bf16: @@ -38904,36 +38853,8 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_writelane_b32 v34, s37, 3 ; GFX8-NEXT: v_writelane_b32 v34, s38, 4 ; GFX8-NEXT: v_writelane_b32 v34, s39, 5 -; GFX8-NEXT: v_writelane_b32 v34, s40, 6 -; GFX8-NEXT: v_writelane_b32 v34, s41, 7 -; GFX8-NEXT: v_writelane_b32 v34, s42, 8 -; GFX8-NEXT: v_writelane_b32 v34, s43, 9 -; GFX8-NEXT: v_writelane_b32 v34, s44, 10 -; GFX8-NEXT: v_writelane_b32 v34, s45, 11 -; GFX8-NEXT: v_writelane_b32 v34, s46, 12 -; GFX8-NEXT: v_writelane_b32 v34, s47, 13 -; GFX8-NEXT: v_writelane_b32 v34, s48, 14 -; GFX8-NEXT: v_writelane_b32 v34, s49, 15 -; 
GFX8-NEXT: v_writelane_b32 v34, s50, 16 -; GFX8-NEXT: v_writelane_b32 v34, s51, 17 -; GFX8-NEXT: v_writelane_b32 v34, s52, 18 -; GFX8-NEXT: v_writelane_b32 v34, s53, 19 -; GFX8-NEXT: v_writelane_b32 v34, s54, 20 -; GFX8-NEXT: v_writelane_b32 v34, s55, 21 -; GFX8-NEXT: v_writelane_b32 v34, s56, 22 -; GFX8-NEXT: v_writelane_b32 v34, s57, 23 -; GFX8-NEXT: v_writelane_b32 v34, s58, 24 -; GFX8-NEXT: v_writelane_b32 v34, s59, 25 -; GFX8-NEXT: v_writelane_b32 v34, s60, 26 -; GFX8-NEXT: v_writelane_b32 v34, s61, 27 -; GFX8-NEXT: v_writelane_b32 v34, s62, 28 -; GFX8-NEXT: v_writelane_b32 v34, s63, 29 -; GFX8-NEXT: v_writelane_b32 v34, s64, 30 -; GFX8-NEXT: v_writelane_b32 v34, s65, 31 -; GFX8-NEXT: v_writelane_b32 v34, s66, 32 -; GFX8-NEXT: v_writelane_b32 v34, s67, 33 -; GFX8-NEXT: v_writelane_b32 v34, s30, 34 -; GFX8-NEXT: v_writelane_b32 v34, s31, 35 +; GFX8-NEXT: v_writelane_b32 v34, s30, 6 +; GFX8-NEXT: v_writelane_b32 v34, s31, 7 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 @@ -38963,43 +38884,43 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v16 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v17 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v18 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v19 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 +; 
GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v21 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v22 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v23 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v24 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v26 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v30 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 ; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 @@ -39036,40 +38957,40 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, 
<32 x ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v32 -; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[66:67] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[64:65] +; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[38:39] +; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[36:37] ; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v31 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[62:63] -; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v31, s[60:61] +; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v31, s[30:31] ; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[58:59] -; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57] +; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[90:91] +; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] ; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v25 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[54:55] -; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53] +; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[78:79] +; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] ; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v23 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[50:51] -; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[48:49] +; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[74:75] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] ; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[46:47] -; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45] +; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[62:63] +; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] ; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v18 
-; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[42:43] -; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[58:59] +; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] ; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[38:39] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37] +; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[46:47] +; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[34:35] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[42:43] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13 ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[28:29] @@ -39122,7 +39043,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v31 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v32 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; GFX8-NEXT: v_readlane_b32 s30, v34, 34 +; GFX8-NEXT: v_readlane_b32 s30, v34, 6 ; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -39131,35 +39052,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v15, v29, v15 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_readlane_b32 s31, v34, 35 -; GFX8-NEXT: v_readlane_b32 s67, v34, 33 -; GFX8-NEXT: v_readlane_b32 s66, v34, 32 -; GFX8-NEXT: v_readlane_b32 s65, v34, 31 -; GFX8-NEXT: v_readlane_b32 s64, v34, 30 -; GFX8-NEXT: v_readlane_b32 s63, v34, 29 -; GFX8-NEXT: v_readlane_b32 s62, v34, 28 -; GFX8-NEXT: v_readlane_b32 s61, v34, 27 -; GFX8-NEXT: v_readlane_b32 s60, v34, 26 -; GFX8-NEXT: v_readlane_b32 s59, v34, 25 -; GFX8-NEXT: v_readlane_b32 s58, v34, 24 -; GFX8-NEXT: v_readlane_b32 s57, v34, 23 -; GFX8-NEXT: v_readlane_b32 s56, v34, 22 -; GFX8-NEXT: v_readlane_b32 s55, v34, 21 -; GFX8-NEXT: v_readlane_b32 s54, v34, 20 -; GFX8-NEXT: v_readlane_b32 s53, v34, 19 -; GFX8-NEXT: v_readlane_b32 s52, v34, 18 -; GFX8-NEXT: v_readlane_b32 s51, v34, 17 -; GFX8-NEXT: v_readlane_b32 s50, v34, 16 -; GFX8-NEXT: v_readlane_b32 s49, v34, 15 -; GFX8-NEXT: v_readlane_b32 s48, v34, 14 -; GFX8-NEXT: v_readlane_b32 s47, v34, 13 -; GFX8-NEXT: v_readlane_b32 s46, v34, 12 -; GFX8-NEXT: v_readlane_b32 s45, v34, 11 -; GFX8-NEXT: v_readlane_b32 s44, v34, 10 -; GFX8-NEXT: v_readlane_b32 s43, v34, 9 -; GFX8-NEXT: v_readlane_b32 s42, v34, 8 -; GFX8-NEXT: v_readlane_b32 s41, v34, 7 -; GFX8-NEXT: v_readlane_b32 s40, v34, 6 +; GFX8-NEXT: v_readlane_b32 s31, v34, 7 ; GFX8-NEXT: v_readlane_b32 s39, v34, 5 ; GFX8-NEXT: v_readlane_b32 s38, v34, 4 ; GFX8-NEXT: v_readlane_b32 s37, v34, 3 @@ -39180,40 +39073,8 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v33, s34, 0 ; GFX9-NEXT: v_writelane_b32 v33, s35, 1 -; GFX9-NEXT: v_writelane_b32 v33, s36, 2 -; GFX9-NEXT: v_writelane_b32 v33, s37, 3 -; GFX9-NEXT: v_writelane_b32 v33, s38, 4 -; GFX9-NEXT: v_writelane_b32 v33, s39, 5 -; GFX9-NEXT: v_writelane_b32 v33, s40, 6 -; GFX9-NEXT: v_writelane_b32 v33, s41, 7 -; GFX9-NEXT: v_writelane_b32 v33, s42, 8 -; GFX9-NEXT: v_writelane_b32 v33, s43, 
9 -; GFX9-NEXT: v_writelane_b32 v33, s44, 10 -; GFX9-NEXT: v_writelane_b32 v33, s45, 11 -; GFX9-NEXT: v_writelane_b32 v33, s46, 12 -; GFX9-NEXT: v_writelane_b32 v33, s47, 13 -; GFX9-NEXT: v_writelane_b32 v33, s48, 14 -; GFX9-NEXT: v_writelane_b32 v33, s49, 15 -; GFX9-NEXT: v_writelane_b32 v33, s50, 16 -; GFX9-NEXT: v_writelane_b32 v33, s51, 17 -; GFX9-NEXT: v_writelane_b32 v33, s52, 18 -; GFX9-NEXT: v_writelane_b32 v33, s53, 19 -; GFX9-NEXT: v_writelane_b32 v33, s54, 20 -; GFX9-NEXT: v_writelane_b32 v33, s55, 21 -; GFX9-NEXT: v_writelane_b32 v33, s56, 22 -; GFX9-NEXT: v_writelane_b32 v33, s57, 23 -; GFX9-NEXT: v_writelane_b32 v33, s58, 24 -; GFX9-NEXT: v_writelane_b32 v33, s59, 25 -; GFX9-NEXT: v_writelane_b32 v33, s60, 26 -; GFX9-NEXT: v_writelane_b32 v33, s61, 27 -; GFX9-NEXT: v_writelane_b32 v33, s62, 28 -; GFX9-NEXT: v_writelane_b32 v33, s63, 29 -; GFX9-NEXT: v_writelane_b32 v33, s64, 30 -; GFX9-NEXT: v_writelane_b32 v33, s65, 31 -; GFX9-NEXT: v_writelane_b32 v33, s66, 32 -; GFX9-NEXT: v_writelane_b32 v33, s67, 33 -; GFX9-NEXT: v_writelane_b32 v33, s30, 34 -; GFX9-NEXT: v_writelane_b32 v33, s31, 35 +; GFX9-NEXT: v_writelane_b32 v33, s30, 2 +; GFX9-NEXT: v_writelane_b32 v33, s31, 3 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 @@ -39241,45 +39102,45 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: v_and_b32_e32 v0, 1, v12 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 +; 
GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v19 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v18 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v21 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v23 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v22 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v24 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v26 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v30 -; GFX9-NEXT: v_cmp_eq_u32_e64 
s[66:67], 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 @@ -39313,42 +39174,42 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v29, v31, v32, s[66:67] +; GFX9-NEXT: v_cndmask_b32_e64 v29, v31, v32, s[34:35] ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[64:65] -; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v30, s[62:63] +; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31] +; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v30, s[94:95] ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v30, s[60:61] -; GFX9-NEXT: v_cndmask_b32_e64 v30, v26, v27, s[58:59] +; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v30, s[92:93] +; GFX9-NEXT: v_cndmask_b32_e64 v30, v26, v27, s[90:91] ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[54:55] +; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] +; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79] ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53] -; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[50:51] +; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] +; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75] ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX9-NEXT: v_cndmask_b32_e64 v22, 
v22, v23, s[48:49] -; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[46:47] +; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] +; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63] ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[42:43] +; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] +; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59] ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[38:39] +; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] +; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47] ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] +; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43] ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] ; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29] ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 @@ -39378,7 +39239,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_readlane_b32 s30, v33, 34 +; GFX9-NEXT: v_readlane_b32 s30, v33, 2 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v2, v5, s4 ; GFX9-NEXT: v_perm_b32 v2, v4, v7, s4 @@ -39395,39 +39256,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, 
<32 x ; GFX9-NEXT: v_perm_b32 v13, v26, v30, s4 ; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4 ; GFX9-NEXT: v_perm_b32 v15, v31, v29, s4 -; GFX9-NEXT: v_readlane_b32 s31, v33, 35 -; GFX9-NEXT: v_readlane_b32 s67, v33, 33 -; GFX9-NEXT: v_readlane_b32 s66, v33, 32 -; GFX9-NEXT: v_readlane_b32 s65, v33, 31 -; GFX9-NEXT: v_readlane_b32 s64, v33, 30 -; GFX9-NEXT: v_readlane_b32 s63, v33, 29 -; GFX9-NEXT: v_readlane_b32 s62, v33, 28 -; GFX9-NEXT: v_readlane_b32 s61, v33, 27 -; GFX9-NEXT: v_readlane_b32 s60, v33, 26 -; GFX9-NEXT: v_readlane_b32 s59, v33, 25 -; GFX9-NEXT: v_readlane_b32 s58, v33, 24 -; GFX9-NEXT: v_readlane_b32 s57, v33, 23 -; GFX9-NEXT: v_readlane_b32 s56, v33, 22 -; GFX9-NEXT: v_readlane_b32 s55, v33, 21 -; GFX9-NEXT: v_readlane_b32 s54, v33, 20 -; GFX9-NEXT: v_readlane_b32 s53, v33, 19 -; GFX9-NEXT: v_readlane_b32 s52, v33, 18 -; GFX9-NEXT: v_readlane_b32 s51, v33, 17 -; GFX9-NEXT: v_readlane_b32 s50, v33, 16 -; GFX9-NEXT: v_readlane_b32 s49, v33, 15 -; GFX9-NEXT: v_readlane_b32 s48, v33, 14 -; GFX9-NEXT: v_readlane_b32 s47, v33, 13 -; GFX9-NEXT: v_readlane_b32 s46, v33, 12 -; GFX9-NEXT: v_readlane_b32 s45, v33, 11 -; GFX9-NEXT: v_readlane_b32 s44, v33, 10 -; GFX9-NEXT: v_readlane_b32 s43, v33, 9 -; GFX9-NEXT: v_readlane_b32 s42, v33, 8 -; GFX9-NEXT: v_readlane_b32 s41, v33, 7 -; GFX9-NEXT: v_readlane_b32 s40, v33, 6 -; GFX9-NEXT: v_readlane_b32 s39, v33, 5 -; GFX9-NEXT: v_readlane_b32 s38, v33, 4 -; GFX9-NEXT: v_readlane_b32 s37, v33, 3 -; GFX9-NEXT: v_readlane_b32 s36, v33, 2 +; GFX9-NEXT: v_readlane_b32 s31, v33, 3 ; GFX9-NEXT: v_readlane_b32 s35, v33, 1 ; GFX9-NEXT: v_readlane_b32 s34, v33, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index f9ffa5ae57f3e..a6af63b816573 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ 
b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -9,53 +9,53 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; CHECK-NEXT: s_load_dwordx8 s[36:43], s[8:9], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[48:55], s[8:9], 0x0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b32 s12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_lg_u32 s40, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB0_8 +; CHECK-NEXT: s_cmp_lg_u32 s52, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_9 ; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i -; CHECK-NEXT: s_cmp_eq_u32 s42, 0 +; CHECK-NEXT: s_cmp_eq_u32 s54, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_4 ; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i -; CHECK-NEXT: s_cmp_lg_u32 s43, 0 +; CHECK-NEXT: s_cmp_lg_u32 s55, 0 ; CHECK-NEXT: s_mov_b32 s17, 0 ; CHECK-NEXT: s_cselect_b32 s12, -1, 0 ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12 ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_mov_b32 s36, 0 -; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12 -; CHECK-NEXT: s_cbranch_vccz .LBB0_6 -; CHECK-NEXT: s_branch .LBB0_7 +; CHECK-NEXT: s_mov_b32 s18, 0 +; CHECK-NEXT: s_branch .LBB0_6 ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_mov_b32 s14, s12 ; CHECK-NEXT: s_mov_b32 s15, s12 ; CHECK-NEXT: s_mov_b32 s13, s12 -; CHECK-NEXT: s_mov_b64 s[38:39], s[14:15] -; CHECK-NEXT: s_mov_b64 s[36:37], s[12:13] -; CHECK-NEXT: s_branch .LBB0_7 +; CHECK-NEXT: s_mov_b64 s[50:51], s[14:15] +; CHECK-NEXT: s_mov_b64 s[48:49], s[12:13] +; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i -; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s41, 0 -; CHECK-NEXT: s_mov_b32 s36, 1.0 +; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s53, 0 +; CHECK-NEXT: s_mov_b32 s18, 1.0 ; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000 -; CHECK-NEXT: s_mov_b32 s37, s36 -; 
CHECK-NEXT: s_mov_b32 s38, s36 -; CHECK-NEXT: s_mov_b32 s39, s36 +; CHECK-NEXT: .LBB0_6: ; %Flow +; CHECK-NEXT: s_mov_b32 s48, 1.0 ; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12 -; CHECK-NEXT: s_cbranch_vccnz .LBB0_7 -; CHECK-NEXT: .LBB0_6: ; %if.end273.i.i +; CHECK-NEXT: s_mov_b32 s49, s48 +; CHECK-NEXT: s_mov_b32 s50, s48 +; CHECK-NEXT: s_mov_b32 s51, s48 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_8 +; CHECK-NEXT: ; %bb.7: ; %if.end273.i.i ; CHECK-NEXT: s_add_u32 s12, s8, 40 ; CHECK-NEXT: s_addc_u32 s13, s9, 0 -; CHECK-NEXT: s_getpc_b64 s[18:19] -; CHECK-NEXT: s_add_u32 s18, s18, _Z3dotDv3_fS_@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s19, s19, _Z3dotDv3_fS_@gotpcrel32@hi+12 +; CHECK-NEXT: s_getpc_b64 s[20:21] +; CHECK-NEXT: s_add_u32 s20, s20, _Z3dotDv3_fS_@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s21, s21, _Z3dotDv3_fS_@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[20:21], s[20:21], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v3, 10, v1 -; CHECK-NEXT: v_add_f32_e64 v1, s17, s36 +; CHECK-NEXT: v_add_f32_e64 v1, s17, s18 ; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[12:13] ; CHECK-NEXT: s_mov_b32 s12, s14 @@ -65,26 +65,26 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s14, s16 -; CHECK-NEXT: s_mov_b32 s36, 0 +; CHECK-NEXT: s_mov_b32 s48, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] ; CHECK-NEXT: s_mov_b64 s[8:9], s[34:35] -; CHECK-NEXT: s_mov_b32 s37, s36 -; CHECK-NEXT: s_mov_b32 s38, s36 -; CHECK-NEXT: s_mov_b32 s39, s36 -; CHECK-NEXT: .LBB0_7: ; %if.end294.i.i +; CHECK-NEXT: s_mov_b32 s49, s48 +; CHECK-NEXT: s_mov_b32 s50, s48 +; CHECK-NEXT: s_mov_b32 s51, s48 +; CHECK-NEXT: .LBB0_8: ; %if.end294.i.i ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; 
CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; CHECK-NEXT: .LBB0_8: ; %kernel_direct_lighting.exit +; CHECK-NEXT: .LBB0_9: ; %kernel_direct_lighting.exit ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20 -; CHECK-NEXT: v_mov_b32_e32 v0, s36 +; CHECK-NEXT: v_mov_b32_e32 v0, s48 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s37 -; CHECK-NEXT: v_mov_b32_e32 v2, s38 -; CHECK-NEXT: v_mov_b32_e32 v3, s39 +; CHECK-NEXT: v_mov_b32_e32 v1, s49 +; CHECK-NEXT: v_mov_b32_e32 v2, s50 +; CHECK-NEXT: v_mov_b32_e32 v3, s51 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 1e1e450aa987b..43a291924471a 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -15,16 +15,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr17, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = 
S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) - ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) + ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 0, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_XOR_B64 renamable $sgpr12_sgpr13, -1, implicit-def dead $scc - ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 8, implicit-def $scc + ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 8, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec @@ -35,7 +35,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) 
%arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -43,10 +43,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46, $sgpr47, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr17 = 
IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 @@ -56,15 +54,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vgpr15 = V_MOV_B32_e32 0, implicit $exec 
+ ; GFX90A-NEXT: renamable $vgpr17 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr4 = COPY renamable $sgpr25, implicit $exec @@ -81,15 +81,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable 
$sgpr44_sgpr45 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -109,32 +109,31 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.Flow20: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, 
$vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr19 = COPY renamable $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr21 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr23 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr22 = COPY 
$sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr25 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr24 = COPY $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr19 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr21 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr20 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr23 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr22 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr25 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr24 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.62, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.8.Flow32: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, 
$sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr18_sgpr19, implicit-def $exec, implicit-def $scc, implicit $exec @@ -143,58 +142,58 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.9.bb89: ; GFX90A-NEXT: successors: %bb.10(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, 
$sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 
= S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr58_sgpr59, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, 
implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; 
GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, 
killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET 
renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: ; GFX90A-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; 
GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.16, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.15.bb72: ; GFX90A-NEXT: successors: %bb.16(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr8 = S_ADD_U32 renamable $sgpr8, 48, implicit-def $scc 
; GFX90A-NEXT: renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit killed $scc @@ -204,113 +203,113 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr13 = COPY killed renamable $sgpr15 ; GFX90A-NEXT: $sgpr14 = COPY killed renamable $sgpr16 ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr18_sgpr19, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.16.Flow36: ; GFX90A-NEXT: successors: %bb.17(0x40000000), %bb.18(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, 
$vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr66_sgpr67, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.18, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.17.bb67: ; GFX90A-NEXT: successors: %bb.18(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, 
align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.18.Flow37: ; GFX90A-NEXT: successors: %bb.19(0x40000000), %bb.20(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr64_sgpr65, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.20, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.19.bb62: ; GFX90A-NEXT: successors: %bb.20(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, 
$sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.20.Flow38: ; GFX90A-NEXT: successors: %bb.21(0x40000000), %bb.22(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr68_sgpr69, 
$vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.22, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.21.bb54: ; GFX90A-NEXT: successors: %bb.22(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; 
GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.22.Flow39: ; GFX90A-NEXT: successors: %bb.23(0x40000000), %bb.24(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.24, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.23.bb47: ; GFX90A-NEXT: successors: %bb.24(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, 
$sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.24.Flow40: ; GFX90A-NEXT: successors: %bb.25(0x40000000), %bb.26(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 
$exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.26, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.25.bb40: ; GFX90A-NEXT: successors: %bb.26(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.26.Flow41: ; GFX90A-NEXT: successors: %bb.27(0x40000000), %bb.28(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; 
GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.28, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.27.bb33: ; GFX90A-NEXT: successors: %bb.28(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.28.Flow42: ; GFX90A-NEXT: successors: %bb.34(0x40000000), %bb.29(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, 
$sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec @@ -319,7 +318,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.29.Flow43: ; GFX90A-NEXT: successors: %bb.30(0x40000000), %bb.31(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc @@ -327,17 +326,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.30.bb19: ; GFX90A-NEXT: successors: %bb.31(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: 
renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.31.Flow44: ; GFX90A-NEXT: successors: %bb.32(0x40000000), %bb.33(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr68_sgpr69, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr68_sgpr69, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.33, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock: @@ -353,32 +352,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.34.bb26: ; GFX90A-NEXT: successors: %bb.29(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr68_sgpr69, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.29 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: 
successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 - ; GFX90A-NEXT: 
renamable $sgpr46_sgpr47 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -396,25 +395,27 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.36.Flow21: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, 
$vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, 
$sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr44_sgpr45, $sgpr42_sgpr43, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = 
V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -433,39 +434,39 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.38.Flow22: ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, 
$vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: 
renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_ANDN2_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr56_sgpr57, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr46_sgpr47, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.36 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: 
successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr66_sgpr67 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable 
$sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -483,33 +484,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.40.Flow23: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: 
renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: 
renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr48_sgpr49, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.38 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -517,10 +517,9 @@ define 
amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec :: (load (s8) from %ir.i42, addrspace 1) ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr18, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -535,47 +534,46 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.46, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, 
$sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, 
implicit-def $scc ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable 
$sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr48_sgpr49, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.40 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, 
$vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc + ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr17, 16, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr50_sgpr51, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.48, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.44: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr56, $vgpr47, $vgpr18, $vgpr30, $vgpr31, $vgpr58, $vgpr61, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr57, $vgpr63, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr46, $vgpr45, $vgpr2, $vgpr3, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr60, $vgpr62 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr57, $vgpr56, $vgpr18, $vgpr30, 
$vgpr31, $vgpr60, $vgpr62, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $vgpr61, $vgpr58, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr3, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr63 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -588,37 +586,36 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.45.Flow26: ; GFX90A-NEXT: successors: %bb.47(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, 
$vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable 
$sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, 
$vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE killed renamable 
$vgpr0_vgpr1, 1024, 0, implicit $exec :: (load (s8) from %ir.i49, addrspace 1) + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec @@ -636,57 +633,56 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.47.Flow25: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, 
$vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: 
renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr56_sgpr57, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; 
GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr48_sgpr49, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.42 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.49: ; GFX90A-NEXT: successors: %bb.44(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable 
$sgpr46_sgpr47 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: S_BRANCH %bb.44 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, 
$sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.51: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, 
$vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -698,25 +694,24 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, 
$vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr50_sgpr51 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr52_sgpr53 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, 
$sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable 
$vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF @@ -727,18 +722,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, 
$vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) ; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 @@ -752,20 +746,19 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.52, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.55.Flow29: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, 
$sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, 
$vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, 
$sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec @@ -774,31 +767,29 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr56, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr57, killed $vgpr10, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr46, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr47, killed $vgpr10, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; 
GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.57: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $exec, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr15 = COPY killed renamable $sgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: 
renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -812,43 +803,40 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr40_vgpr41 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr46_vgpr47 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = COPY renamable $vgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = COPY renamable $vgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr16 = COPY renamable $vgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr53 = COPY renamable $vgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = COPY renamable $vgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $vgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr53 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.58.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, 
$sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr33, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable 
$vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr23 = S_MOV_B32 0 - ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, 
$sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) - ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF @@ -859,31 +847,31 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: $sgpr52_sgpr53 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.60.Flow31: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, 
$vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, 
$vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr52_sgpr53, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.61.Flow30: ; GFX90A-NEXT: successors: %bb.55(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: 
renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr50_sgpr51, killed renamable $sgpr56_sgpr57, implicit-def dead $scc + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; 
GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr52_sgpr53, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.62.bb140: ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, 
$vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -891,14 +879,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.63.Flow13: ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, 
$sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.64.bb159: ; GFX90A-NEXT: successors: 
%bb.67(0x40000000), %bb.65(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec @@ -907,21 +895,21 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.65.Flow10: ; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, 
$vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, 
$vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.67.bb161: ; GFX90A-NEXT: successors: %bb.65(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, 
$vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec @@ -940,7 +928,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, 
$vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec @@ -956,14 +944,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.69.Flow: ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, 
$sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, 
$vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.70.bb186: ; GFX90A-NEXT: successors: %bb.71(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, 
$vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 
; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr27, implicit $exec @@ -992,14 +980,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.71.Flow9: ; GFX90A-NEXT: successors: %bb.63(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, 
$vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.72.bb196: ; GFX90A-NEXT: successors: %bb.69(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, 
$vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec diff --git 
a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index c0f3323fb8db6..3708af395f8b9 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -900,9 +900,8 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_waitcnt expcnt(1) +; CHECK-NEXT: s_waitcnt expcnt(0) ; CHECK-NEXT: v_writelane_b32 v0, s33, 0 ; CHECK-NEXT: v_writelane_b32 v0, s34, 1 ; CHECK-NEXT: v_writelane_b32 v0, s35, 2 @@ -910,73 +909,40 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v0, s37, 4 ; CHECK-NEXT: v_writelane_b32 v0, s38, 5 ; CHECK-NEXT: v_writelane_b32 v0, s39, 6 -; CHECK-NEXT: v_writelane_b32 v0, s40, 7 -; CHECK-NEXT: v_writelane_b32 v0, s41, 8 -; CHECK-NEXT: v_writelane_b32 v0, s42, 9 -; CHECK-NEXT: v_writelane_b32 v0, s43, 10 -; CHECK-NEXT: v_writelane_b32 v0, s44, 11 -; CHECK-NEXT: v_writelane_b32 v0, s45, 12 -; CHECK-NEXT: v_writelane_b32 v0, s46, 13 -; CHECK-NEXT: v_writelane_b32 v0, s47, 14 -; CHECK-NEXT: v_writelane_b32 v0, s48, 15 -; CHECK-NEXT: v_writelane_b32 v0, s49, 16 -; CHECK-NEXT: v_writelane_b32 v0, s50, 17 -; CHECK-NEXT: v_writelane_b32 v0, s51, 18 -; CHECK-NEXT: v_writelane_b32 v0, s52, 19 -; CHECK-NEXT: v_writelane_b32 v0, s53, 20 -; CHECK-NEXT: v_writelane_b32 v0, s54, 21 -; CHECK-NEXT: v_writelane_b32 v0, s55, 22 -; CHECK-NEXT: v_writelane_b32 v0, s56, 23 -; CHECK-NEXT: v_writelane_b32 v0, s57, 24 -; CHECK-NEXT: v_writelane_b32 v0, s58, 25 -; CHECK-NEXT: v_writelane_b32 v0, s59, 26 -; CHECK-NEXT: v_writelane_b32 v0, s60, 27 -; CHECK-NEXT: v_writelane_b32 v0, s61, 28 -; CHECK-NEXT: v_writelane_b32 v0, s62, 29 -; 
CHECK-NEXT: v_writelane_b32 v0, s63, 30 -; CHECK-NEXT: v_writelane_b32 v0, s64, 31 -; CHECK-NEXT: v_writelane_b32 v0, s65, 32 -; CHECK-NEXT: v_writelane_b32 v0, s66, 33 -; CHECK-NEXT: v_writelane_b32 v0, s67, 34 -; CHECK-NEXT: v_writelane_b32 v0, s68, 35 -; CHECK-NEXT: v_writelane_b32 v0, s69, 36 -; CHECK-NEXT: v_writelane_b32 v0, s70, 37 -; CHECK-NEXT: v_writelane_b32 v0, s71, 38 -; CHECK-NEXT: v_writelane_b32 v0, s72, 39 -; CHECK-NEXT: v_writelane_b32 v0, s73, 40 -; CHECK-NEXT: v_writelane_b32 v0, s74, 41 -; CHECK-NEXT: v_writelane_b32 v0, s75, 42 -; CHECK-NEXT: v_writelane_b32 v0, s76, 43 -; CHECK-NEXT: v_writelane_b32 v0, s77, 44 -; CHECK-NEXT: v_writelane_b32 v0, s78, 45 -; CHECK-NEXT: v_writelane_b32 v0, s79, 46 -; CHECK-NEXT: v_writelane_b32 v0, s80, 47 -; CHECK-NEXT: v_writelane_b32 v0, s81, 48 -; CHECK-NEXT: v_writelane_b32 v0, s82, 49 -; CHECK-NEXT: v_writelane_b32 v0, s83, 50 -; CHECK-NEXT: v_writelane_b32 v0, s84, 51 -; CHECK-NEXT: v_writelane_b32 v0, s85, 52 -; CHECK-NEXT: v_writelane_b32 v0, s86, 53 -; CHECK-NEXT: v_writelane_b32 v0, s87, 54 -; CHECK-NEXT: v_writelane_b32 v0, s88, 55 -; CHECK-NEXT: v_writelane_b32 v0, s89, 56 -; CHECK-NEXT: v_writelane_b32 v0, s90, 57 -; CHECK-NEXT: v_writelane_b32 v0, s91, 58 -; CHECK-NEXT: v_writelane_b32 v0, s92, 59 -; CHECK-NEXT: v_writelane_b32 v0, s93, 60 -; CHECK-NEXT: v_writelane_b32 v0, s94, 61 -; CHECK-NEXT: v_writelane_b32 v0, s95, 62 -; CHECK-NEXT: v_writelane_b32 v0, s96, 63 -; CHECK-NEXT: s_waitcnt expcnt(0) -; CHECK-NEXT: v_writelane_b32 v1, s97, 0 -; CHECK-NEXT: v_writelane_b32 v1, s98, 1 -; CHECK-NEXT: v_writelane_b32 v1, s99, 2 -; CHECK-NEXT: v_writelane_b32 v1, s100, 3 -; CHECK-NEXT: v_writelane_b32 v1, s101, 4 -; CHECK-NEXT: v_writelane_b32 v1, s30, 5 -; CHECK-NEXT: v_writelane_b32 v1, s31, 6 -; CHECK-NEXT: s_mov_b32 s31, s12 -; CHECK-NEXT: s_cmp_eq_u32 s31, 0 +; CHECK-NEXT: v_writelane_b32 v0, s48, 7 +; CHECK-NEXT: v_writelane_b32 v0, s49, 8 +; CHECK-NEXT: v_writelane_b32 v0, s50, 9 +; CHECK-NEXT: 
v_writelane_b32 v0, s51, 10 +; CHECK-NEXT: v_writelane_b32 v0, s52, 11 +; CHECK-NEXT: v_writelane_b32 v0, s53, 12 +; CHECK-NEXT: v_writelane_b32 v0, s54, 13 +; CHECK-NEXT: v_writelane_b32 v0, s55, 14 +; CHECK-NEXT: v_writelane_b32 v0, s64, 15 +; CHECK-NEXT: v_writelane_b32 v0, s65, 16 +; CHECK-NEXT: v_writelane_b32 v0, s66, 17 +; CHECK-NEXT: v_writelane_b32 v0, s67, 18 +; CHECK-NEXT: v_writelane_b32 v0, s68, 19 +; CHECK-NEXT: v_writelane_b32 v0, s69, 20 +; CHECK-NEXT: v_writelane_b32 v0, s70, 21 +; CHECK-NEXT: v_writelane_b32 v0, s71, 22 +; CHECK-NEXT: v_writelane_b32 v0, s80, 23 +; CHECK-NEXT: v_writelane_b32 v0, s81, 24 +; CHECK-NEXT: v_writelane_b32 v0, s82, 25 +; CHECK-NEXT: v_writelane_b32 v0, s83, 26 +; CHECK-NEXT: v_writelane_b32 v0, s84, 27 +; CHECK-NEXT: v_writelane_b32 v0, s85, 28 +; CHECK-NEXT: v_writelane_b32 v0, s86, 29 +; CHECK-NEXT: v_writelane_b32 v0, s87, 30 +; CHECK-NEXT: v_writelane_b32 v0, s96, 31 +; CHECK-NEXT: v_writelane_b32 v0, s97, 32 +; CHECK-NEXT: v_writelane_b32 v0, s98, 33 +; CHECK-NEXT: v_writelane_b32 v0, s99, 34 +; CHECK-NEXT: v_writelane_b32 v0, s100, 35 +; CHECK-NEXT: v_writelane_b32 v0, s101, 36 +; CHECK-NEXT: v_writelane_b32 v0, s30, 37 +; CHECK-NEXT: v_writelane_b32 v0, s31, 38 +; CHECK-NEXT: s_mov_b32 s40, s12 +; CHECK-NEXT: s_cmp_eq_u32 s40, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: ;;#ASMEND @@ -1292,9 +1258,9 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: s_cbranch_scc0 .LBB1_1 ; CHECK-NEXT: ; %bb.3: ; %entry ; CHECK-NEXT: s_not_b64 exec, exec -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; CHECK-NEXT: v_writelane_b32 v2, s0, 0 -; CHECK-NEXT: v_writelane_b32 v2, s1, 1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; CHECK-NEXT: v_writelane_b32 v1, s0, 0 +; CHECK-NEXT: v_writelane_b32 v1, s1, 1 ; CHECK-NEXT: s_getpc_b64 s[0:1] ; CHECK-NEXT: .Lpost_getpc1: ; CHECK-NEXT: s_add_u32 s0, s0, (.LBB1_4-.Lpost_getpc1)&4294967295 @@ -1313,9 
+1279,9 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_branch .LBB1_2 ; CHECK-NEXT: .LBB1_4: ; %bb3 -; CHECK-NEXT: v_readlane_b32 s0, v2, 0 -; CHECK-NEXT: v_readlane_b32 s1, v2, 1 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; CHECK-NEXT: v_readlane_b32 s0, v1, 0 +; CHECK-NEXT: v_readlane_b32 s1, v1, 1 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; CHECK-NEXT: s_not_b64 exec, exec ; CHECK-NEXT: .LBB1_2: ; %bb3 ; CHECK-NEXT: ;;#ASMSTART @@ -1414,7 +1380,7 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use s31 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s30, v1, 5 +; CHECK-NEXT: v_readlane_b32 s30, v0, 37 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use s32 ; CHECK-NEXT: ;;#ASMEND @@ -1631,69 +1597,37 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use vcc_hi ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s31, v1, 6 -; CHECK-NEXT: v_readlane_b32 s101, v1, 4 -; CHECK-NEXT: v_readlane_b32 s100, v1, 3 -; CHECK-NEXT: v_readlane_b32 s99, v1, 2 -; CHECK-NEXT: v_readlane_b32 s98, v1, 1 -; CHECK-NEXT: v_readlane_b32 s97, v1, 0 -; CHECK-NEXT: v_readlane_b32 s96, v0, 63 -; CHECK-NEXT: v_readlane_b32 s95, v0, 62 -; CHECK-NEXT: v_readlane_b32 s94, v0, 61 -; CHECK-NEXT: v_readlane_b32 s93, v0, 60 -; CHECK-NEXT: v_readlane_b32 s92, v0, 59 -; CHECK-NEXT: v_readlane_b32 s91, v0, 58 -; CHECK-NEXT: v_readlane_b32 s90, v0, 57 -; CHECK-NEXT: v_readlane_b32 s89, v0, 56 -; CHECK-NEXT: v_readlane_b32 s88, v0, 55 -; CHECK-NEXT: v_readlane_b32 s87, v0, 54 -; CHECK-NEXT: v_readlane_b32 s86, v0, 53 -; CHECK-NEXT: v_readlane_b32 s85, v0, 52 -; CHECK-NEXT: v_readlane_b32 s84, v0, 51 -; CHECK-NEXT: v_readlane_b32 s83, v0, 50 -; CHECK-NEXT: v_readlane_b32 s82, v0, 49 -; CHECK-NEXT: v_readlane_b32 s81, v0, 48 -; CHECK-NEXT: v_readlane_b32 s80, v0, 47 -; CHECK-NEXT: v_readlane_b32 s79, v0, 46 
-; CHECK-NEXT: v_readlane_b32 s78, v0, 45 -; CHECK-NEXT: v_readlane_b32 s77, v0, 44 -; CHECK-NEXT: v_readlane_b32 s76, v0, 43 -; CHECK-NEXT: v_readlane_b32 s75, v0, 42 -; CHECK-NEXT: v_readlane_b32 s74, v0, 41 -; CHECK-NEXT: v_readlane_b32 s73, v0, 40 -; CHECK-NEXT: v_readlane_b32 s72, v0, 39 -; CHECK-NEXT: v_readlane_b32 s71, v0, 38 -; CHECK-NEXT: v_readlane_b32 s70, v0, 37 -; CHECK-NEXT: v_readlane_b32 s69, v0, 36 -; CHECK-NEXT: v_readlane_b32 s68, v0, 35 -; CHECK-NEXT: v_readlane_b32 s67, v0, 34 -; CHECK-NEXT: v_readlane_b32 s66, v0, 33 -; CHECK-NEXT: v_readlane_b32 s65, v0, 32 -; CHECK-NEXT: v_readlane_b32 s64, v0, 31 -; CHECK-NEXT: v_readlane_b32 s63, v0, 30 -; CHECK-NEXT: v_readlane_b32 s62, v0, 29 -; CHECK-NEXT: v_readlane_b32 s61, v0, 28 -; CHECK-NEXT: v_readlane_b32 s60, v0, 27 -; CHECK-NEXT: v_readlane_b32 s59, v0, 26 -; CHECK-NEXT: v_readlane_b32 s58, v0, 25 -; CHECK-NEXT: v_readlane_b32 s57, v0, 24 -; CHECK-NEXT: v_readlane_b32 s56, v0, 23 -; CHECK-NEXT: v_readlane_b32 s55, v0, 22 -; CHECK-NEXT: v_readlane_b32 s54, v0, 21 -; CHECK-NEXT: v_readlane_b32 s53, v0, 20 -; CHECK-NEXT: v_readlane_b32 s52, v0, 19 -; CHECK-NEXT: v_readlane_b32 s51, v0, 18 -; CHECK-NEXT: v_readlane_b32 s50, v0, 17 -; CHECK-NEXT: v_readlane_b32 s49, v0, 16 -; CHECK-NEXT: v_readlane_b32 s48, v0, 15 -; CHECK-NEXT: v_readlane_b32 s47, v0, 14 -; CHECK-NEXT: v_readlane_b32 s46, v0, 13 -; CHECK-NEXT: v_readlane_b32 s45, v0, 12 -; CHECK-NEXT: v_readlane_b32 s44, v0, 11 -; CHECK-NEXT: v_readlane_b32 s43, v0, 10 -; CHECK-NEXT: v_readlane_b32 s42, v0, 9 -; CHECK-NEXT: v_readlane_b32 s41, v0, 8 -; CHECK-NEXT: v_readlane_b32 s40, v0, 7 +; CHECK-NEXT: v_readlane_b32 s31, v0, 38 +; CHECK-NEXT: v_readlane_b32 s101, v0, 36 +; CHECK-NEXT: v_readlane_b32 s100, v0, 35 +; CHECK-NEXT: v_readlane_b32 s99, v0, 34 +; CHECK-NEXT: v_readlane_b32 s98, v0, 33 +; CHECK-NEXT: v_readlane_b32 s97, v0, 32 +; CHECK-NEXT: v_readlane_b32 s96, v0, 31 +; CHECK-NEXT: v_readlane_b32 s87, v0, 30 +; CHECK-NEXT: 
v_readlane_b32 s86, v0, 29 +; CHECK-NEXT: v_readlane_b32 s85, v0, 28 +; CHECK-NEXT: v_readlane_b32 s84, v0, 27 +; CHECK-NEXT: v_readlane_b32 s83, v0, 26 +; CHECK-NEXT: v_readlane_b32 s82, v0, 25 +; CHECK-NEXT: v_readlane_b32 s81, v0, 24 +; CHECK-NEXT: v_readlane_b32 s80, v0, 23 +; CHECK-NEXT: v_readlane_b32 s71, v0, 22 +; CHECK-NEXT: v_readlane_b32 s70, v0, 21 +; CHECK-NEXT: v_readlane_b32 s69, v0, 20 +; CHECK-NEXT: v_readlane_b32 s68, v0, 19 +; CHECK-NEXT: v_readlane_b32 s67, v0, 18 +; CHECK-NEXT: v_readlane_b32 s66, v0, 17 +; CHECK-NEXT: v_readlane_b32 s65, v0, 16 +; CHECK-NEXT: v_readlane_b32 s64, v0, 15 +; CHECK-NEXT: v_readlane_b32 s55, v0, 14 +; CHECK-NEXT: v_readlane_b32 s54, v0, 13 +; CHECK-NEXT: v_readlane_b32 s53, v0, 12 +; CHECK-NEXT: v_readlane_b32 s52, v0, 11 +; CHECK-NEXT: v_readlane_b32 s51, v0, 10 +; CHECK-NEXT: v_readlane_b32 s50, v0, 9 +; CHECK-NEXT: v_readlane_b32 s49, v0, 8 +; CHECK-NEXT: v_readlane_b32 s48, v0, 7 ; CHECK-NEXT: v_readlane_b32 s39, v0, 6 ; CHECK-NEXT: v_readlane_b32 s38, v0, 5 ; CHECK-NEXT: v_readlane_b32 s37, v0, 4 @@ -1703,7 +1637,6 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s33, v0, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index ff47c865c67e6..a03ad4daab014 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -417,12 +417,13 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: v_cmpx_ne_u32_e32 0, v2 @@ -1495,7 +1496,6 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index a969e3d4f4f79..0ea73ad4c5019 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -382,10 +382,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -394,7 +394,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -402,7 +401,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: @@ -1894,14 +1892,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2145,14 +2141,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -2376,10 +2370,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2387,7 +2381,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: @@ -2409,17 +2402,16 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB10_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: 
Header=BB10_3 Depth=1 @@ -2434,7 +2426,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB10_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2921,14 +2912,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: @@ -3192,14 +3181,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -3439,7 +3426,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: 
buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -3468,7 +3454,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3790,7 +3775,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start @@ -3818,7 +3802,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4133,17 +4116,16 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: @@ -4170,17 +4152,16 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 @@ -4196,7 +4177,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4779,7 +4759,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start @@ -4795,6 +4774,7 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -4816,7 +4796,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5185,7 +5164,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start @@ -5201,6 +5179,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -5221,7 +5200,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5582,17 +5560,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: @@ -5614,6 +5591,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -5629,17 +5607,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 @@ -5655,7 +5632,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6759,10 +6735,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6771,7 +6747,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: @@ -6779,7 +6754,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV 
-; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9080,10 +9054,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -9092,7 +9066,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: @@ -9100,7 +9073,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index c7511a2df9fe1..7f06d169a6b13 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -374,10 +374,10 @@ define float 
@buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -386,7 +386,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: buffer_atomic_max_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -394,7 +393,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1220,14 +1218,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1413,14 +1409,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1589,10 +1583,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1600,7 +1594,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: @@ -1624,17 +1617,16 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 
s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 @@ -1649,7 +1641,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2034,14 +2025,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2323,14 +2312,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2511,7 +2498,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start @@ -2541,7 +2527,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2877,7 +2862,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start @@ -2906,7 +2890,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3234,17 +3217,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: @@ -3274,17 +3256,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 @@ -3300,7 +3281,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; 
GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3896,7 +3876,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -3912,6 +3891,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -3933,7 +3913,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4304,7 +4283,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4320,6 +4298,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; 
GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -4340,7 +4319,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4703,17 +4681,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: @@ -4735,6 +4712,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 
v4, v5, v11, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -4750,17 +4728,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 @@ -4776,7 +4753,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5418,14 +5394,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5738,14 +5712,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6038,10 +6010,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6049,7 +6021,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: @@ -6073,17 +6044,16 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 
0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 @@ -6099,7 +6069,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6673,6 +6642,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2 @@ -6681,6 +6651,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX12-NEXT: v_perm_b32 v5, v1, v0, 
0x7060302 @@ -6690,14 +6661,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7099,8 +7068,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 @@ -7110,14 +7081,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7494,10 +7463,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7505,7 +7474,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: @@ -7531,12 +7499,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 ; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 
Depth=1 @@ -7545,17 +7515,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 @@ -7571,7 +7540,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 0bcaacc6b08e8..a6eb81fcbf515 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -374,10 +374,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -386,7 +386,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: buffer_atomic_min_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -394,7 +393,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -1220,14 +1218,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1413,14 +1409,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] ; GFX12-NEXT: v_dual_mov_b32 
v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: @@ -1589,10 +1583,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1600,7 +1594,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: @@ -1624,17 +1617,16 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 @@ -1649,7 +1641,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -2034,14 +2025,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: @@ -2323,14 +2312,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -2511,7 +2498,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start @@ -2541,7 +2527,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2877,7 +2862,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start @@ -2906,7 +2890,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3234,17 +3217,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: @@ -3274,17 +3256,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 @@ -3300,7 +3281,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3896,7 +3876,6 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -3912,6 +3891,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -3933,7 +3913,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4304,7 +4283,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4320,6 +4298,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ 
-4340,7 +4319,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4703,17 +4681,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: @@ -4735,6 +4712,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -4750,17 +4728,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 @@ -4776,7 +4753,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5418,14 +5394,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -5738,14 +5712,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6038,10 +6010,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6049,7 +6021,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: @@ -6073,17 +6044,16 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 @@ -6099,7 +6069,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6673,6 +6642,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 ; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2 @@ -6681,6 +6651,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 @@ -6690,14 +6661,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7099,8 +7068,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 @@ -7110,14 +7081,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: @@ -7494,10 +7463,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7505,7 +7474,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: @@ -7531,12 +7499,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 ; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 @@ -7545,17 +7515,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 @@ -7571,7 +7540,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/bug-undef-spilled-agpr.mir b/llvm/test/CodeGen/AMDGPU/bug-undef-spilled-agpr.mir new file mode 100644 index 0000000000000..72b6b9f9ec686 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bug-undef-spilled-agpr.mir @@ -0,0 +1,103 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -run-pass=si-lower-sgpr-spills,greedy,si-lower-wwm-copies,virtregrewriter,prologepilog -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: widget +tracksRegLiveness: true +frameInfo: + adjustsStack: true +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } + - { id: 1, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + hasSpilledSGPRs: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' +body: | + ; GCN-LABEL: name: widget + ; GCN: bb.0: + ; GCN-NEXT: 
successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $agpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr0 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr14 + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GCN-NEXT: frame-setup CFI_INSTRUCTION offset $agpr0, 0 + ; GCN-NEXT: $exec = S_MOV_B64 -1 + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; GCN-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr62, 256 + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: renamable $vgpr62 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr15, 0, killed $vgpr62 + ; GCN-NEXT: $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: renamable $agpr0 = COPY killed renamable $vgpr62 + ; GCN-NEXT: $exec = S_MOV_B64 killed $noreg + ; GCN-NEXT: renamable $vgpr62 = IMPLICIT_DEF + ; GCN-NEXT: dead renamable $vgpr62 = V_AND_B32_e32 1, killed $vgpr62, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $agpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) + ; GCN-NEXT: liveins: $agpr0, $sgpr86, $sgpr87, $sgpr66_sgpr67, $sgpr68_sgpr69, 
$sgpr70_sgpr71, $sgpr80_sgpr81, $sgpr82_sgpr83, $sgpr84_sgpr85, $sgpr96_sgpr97, $sgpr98_sgpr99 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $agpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: renamable $vgpr62 = COPY renamable $agpr0 + ; GCN-NEXT: $exec = S_MOV_B64 killed $noreg + ; GCN-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR killed $vgpr62, 1 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4: + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GCN-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec + ; GCN-NEXT: $exec = S_MOV_B64 -1 + ; GCN-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN + bb.0: + liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15 + + %45:vgpr_32 = IMPLICIT_DEF + SI_SPILL_S32_SAVE $sgpr15, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) + %16:vgpr_32 = V_AND_B32_e32 1, %45, implicit $exec + + bb.1: + successors: %bb.3, %bb.2 + + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.3 + + bb.2: + successors: %bb.4(0x04000000), %bb.1(0x7c000000) + liveins: $sgpr86, $sgpr87, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr80_sgpr81, $sgpr82_sgpr83, $sgpr84_sgpr85, $sgpr96_sgpr97, $sgpr98_sgpr99 + + S_CBRANCH_EXECNZ %bb.1, implicit $exec + S_BRANCH %bb.4 + + bb.3: + ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit 
$sgpr32 + $sgpr14 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) + ADJCALLSTACKDOWN 0, 28, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + S_BRANCH %bb.2 + + bb.4: + SI_RETURN + +... diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll index d4c50cf2c7e4a..34f4476f7fd6a 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll @@ -1,6 +1,6 @@ -; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s -; CHECK: LLVM ERROR: failed to find free scratch register +; CHECK: illegal VGPR to SGPR copy declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0 declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0 diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll index a939f4b2bfa78..31f6a1db8123f 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll @@ -1385,9 +1385,9 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) # ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s29, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 vcc, -1 +; GFX9-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, vcc +; GFX9-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-NEXT: v_writelane_b32 v40, s29, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -1405,10 +1405,10 @@ 
define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) # ; GFX9-NEXT: s_mov_b32 s22, s26 ; GFX9-NEXT: s_mov_b32 s23, s27 ; GFX9-NEXT: s_mov_b32 s24, s28 -; GFX9-NEXT: s_getpc_b64 vcc -; GFX9-NEXT: s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 vcc_hi, vcc_hi, external_void_func_a15i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], vcc +; GFX9-NEXT: s_getpc_b64 s[40:41] +; GFX9-NEXT: s_add_u32 s40, s40, external_void_func_a15i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[40:41] ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index cd4df4f5c79aa..bfcf5be861777 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -4594,16 +4594,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %out) #0 { ; VI-LABEL: test_call_external_i32_func_i32_imm: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s42, -1 -; VI-NEXT: s_mov_b32 s43, 0xe80000 -; VI-NEXT: s_add_u32 s40, s40, s5 +; VI-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s50, -1 +; VI-NEXT: s_mov_b32 s51, 0xe80000 +; VI-NEXT: s_add_u32 s48, s48, s5 ; VI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 -; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_addc_u32 s49, s49, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[40:41] -; VI-NEXT: s_mov_b64 s[2:3], s[42:43] +; VI-NEXT: s_mov_b64 s[0:1], s[48:49] +; VI-NEXT: s_mov_b64 s[2:3], s[50:51] ; VI-NEXT: 
v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_mov_b32 s39, 0xf000 @@ -4618,16 +4618,16 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; ; CI-LABEL: test_call_external_i32_func_i32_imm: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; CI-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_mov_b32 s42, -1 -; CI-NEXT: s_mov_b32 s43, 0xe8f000 -; CI-NEXT: s_add_u32 s40, s40, s5 +; CI-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s50, -1 +; CI-NEXT: s_mov_b32 s51, 0xe8f000 +; CI-NEXT: s_add_u32 s48, s48, s5 ; CI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; CI-NEXT: s_addc_u32 s41, s41, 0 +; CI-NEXT: s_addc_u32 s49, s49, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[40:41] -; CI-NEXT: s_mov_b64 s[2:3], s[42:43] +; CI-NEXT: s_mov_b64 s[0:1], s[48:49] +; CI-NEXT: s_mov_b64 s[2:3], s[50:51] ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_mov_b32 s39, 0xf000 @@ -4642,16 +4642,16 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; ; GFX9-LABEL: test_call_external_i32_func_i32_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s42, -1 -; GFX9-NEXT: s_mov_b32 s43, 0xe00000 -; GFX9-NEXT: s_add_u32 s40, s40, s5 +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s5 ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 -; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; 
GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_mov_b32 s39, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index c4d54fa490e0c..e17fad8e03521 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s @@ -5,110 +6,258 @@ declare hidden void @external_void_func_void() #3 -; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: s_getpc_b64 s[34:35] -; GCN-NEXT: s_add_u32 s34, s34, -; GCN-NEXT: s_addc_u32 s35, s35, -; GCN: s_swappc_b64 s[30:31], s[34:35] - -; GCN-NEXT: #ASMSTART -; GCN-NEXT: #ASMEND -; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35] define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { +; FLATSCR-LABEL: test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: s_getpc_b64 s[34:35] +; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; 
FLATSCR-NEXT: s_endpgm call void @external_void_func_void() call void asm sideeffect "", ""() #0 call void @external_void_func_void() ret void } -; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; MUBUF: buffer_store_dword -; FLATSCR: scratch_store_dword -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 -; GCN: v_writelane_b32 v40, s34, 0 -; GCN: v_writelane_b32 v40, s35, 1 -; GCN: v_writelane_b32 v40, s30, 2 -; GCN: v_writelane_b32 v40, s31, 3 - -; GCN: s_swappc_b64 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 s30, v40, 2 -; GCN: v_readlane_b32 s31, v40, 3 -; MUBUF-DAG: v_readlane_b32 s35, v40, 1 -; MUBUF-DAG: v_readlane_b32 s34, v40, 0 -; FLATSCR-DAG: v_readlane_b32 s35, v40, 1 -; FLATSCR-DAG: v_readlane_b32 s34, v40, 0 - -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 -; MUBUF: buffer_load_dword -; FLATSCR: scratch_load_dword -; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN: s_setpc_b64 s[30:31] define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { +; MUBUF-LABEL: test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v40, s4, 4 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s34, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s35, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 3 +; MUBUF-NEXT: s_getpc_b64 s[34:35] +; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s35, s35, 
external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] +; MUBUF-NEXT: v_readlane_b32 s30, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s35, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s34, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 4 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s35, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 3 +; FLATSCR-NEXT: s_getpc_b64 s[34:35] +; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s35, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4 +; FLATSCR-NEXT: s_or_saveexec_b64 
s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_void() call void asm sideeffect "", ""() #0 call void @external_void_func_void() ret void } -; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_mov_b32 s33, s32 -; MUBUF: buffer_store_dword v40 -; FLATSCR: scratch_store_dword off, v40 -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 -; MUBUF: s_addk_i32 s32, 0x400 -; FLATSCR: s_add_i32 s32, s32, 16 - -; GCN: s_swappc_b64 -; GCN-NEXT: s_swappc_b64 - -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 -; MUBUF: buffer_load_dword v40 -; FLATSCR: scratch_load_dword v40 -; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] define void @test_func_call_external_void_funcx2() #0 { +; MUBUF-LABEL: test_func_call_external_void_funcx2: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v40, s4, 4 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s34, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s35, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 3 +; MUBUF-NEXT: s_getpc_b64 s[34:35] +; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] +; MUBUF-NEXT: v_readlane_b32 s30, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s35, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s34, 
v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 4 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: test_func_call_external_void_funcx2: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s35, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 3 +; FLATSCR-NEXT: s_getpc_b64 s[34:35] +; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s35, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_void() call void @external_void_func_void() ret void } -; GCN-LABEL: {{^}}void_func_void_clobber_s30_s31: -; GCN: s_waitcnt -; GCN: v_writelane_b32 v0, s30, 0 -; GCN: v_writelane_b32 v0, 
s31, 1 -; GCN-NEXT: #ASMSTART -; GCN: ; clobber -; GCN-NEXT: #ASMEND -; GCN: v_readlane_b32 s30, v0, 0 -; GCN: v_readlane_b32 s31, v0, 1 -; GCN: s_setpc_b64 s[30:31] define void @void_func_void_clobber_s30_s31() #2 { +; MUBUF-LABEL: void_func_void_clobber_s30_s31: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v0, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v0, s31, 1 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s30, v0, 0 +; MUBUF-NEXT: v_readlane_b32 s31, v0, 1 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: void_func_void_clobber_s30_s31: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v0, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v0, s31, 1 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s30, v0, 0 +; FLATSCR-NEXT: v_readlane_b32 s31, v0, 1 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s[30:31]}"() #0 ret void } -; GCN-LABEL: {{^}}void_func_void_clobber_vcc: -; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_setpc_b64 s[30:31] 
define hidden void @void_func_void_clobber_vcc() #2 { +; GCN-LABEL: void_func_void_clobber_vcc: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{vcc}"() #0 ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc: -; GCN: s_mov_b64 s[34:35], vcc -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_swappc_b64 -; GCN: s_mov_b64 vcc, s[34:35] define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_clobber_vcc: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_add_u32 s8, s4, 8 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FLATSCR-NEXT: s_mov_b32 s14, s12 +; FLATSCR-NEXT: s_mov_b32 s13, s11 +; FLATSCR-NEXT: s_mov_b32 s12, s10 +; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] +; FLATSCR-NEXT: s_addc_u32 s9, s5, 0 +; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 +; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] +; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def vcc +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b64 s[34:35], vcc +; FLATSCR-NEXT: s_getpc_b64 s[16:17] +; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_vcc@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_vcc@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] +; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b64 vcc, s[34:35] +; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1 +; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; 
use vcc +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %vcc = call i64 asm sideeffect "; def $0", "={vcc}"() call void @void_func_void_clobber_vcc() %val0 = load volatile i32, ptr addrspace(1) undef @@ -117,22 +266,50 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31: -; GCN: s_mov_b32 s33, s31 -; GCN: s_swappc_b64 -; GCN-NEXT: s_mov_b32 s31, s33 define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_mayclobber_s31: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s31 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s33, s31 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: s_mov_b32 s31, s33 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s31 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s31 = call i32 asm sideeffect "; def $0", "={s31}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{s31}"(i32 %s31) ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31: -; GCN: v_mov_b32_e32 v40, v31 -; GCN: s_swappc_b64 -; GCN-NEXT: v_mov_b32_e32 v31, v40 define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_mayclobber_v31: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def v31 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_mov_b32_e32 v40, v31 
+; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: v_mov_b32_e32 v31, v40 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v31 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %v31 = call i32 asm sideeffect "; def $0", "={v31}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{v31}"(i32 %v31) @@ -140,169 +317,297 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(ptr addrspace } ; FIXME: What is the expected behavior for reserved registers here? - -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: -; GCN: #ASMSTART -; GCN-NEXT: ; def s33 -; GCN-NEXT: #ASMEND -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN: ;;#ASMSTART -; GCN-NEXT: ; use s33 -; GCN-NEXT: ;;#ASMEND -; GCN-NOT: s33 -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_preserves_s33(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_preserves_s33: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s33 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: 
;;#ASMSTART +; FLATSCR-NEXT: ; use s33 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s33 = call i32 asm sideeffect "; def $0", "={s33}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{s33}"(i32 %s33) ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}} -; GCN-NOT: s34 - -; GCN: s_mov_b32 s32, 0 - -; GCN-NOT: s34 -; GCN: ;;#ASMSTART -; GCN-NEXT: ; def s34 -; GCN-NEXT: ;;#ASMEND -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 - -; GCN-NOT: s34 -; MUBUF: s_swappc_b64 s[30:31], s[4:5] -; FLATSCR: s_swappc_b64 s[30:31], s[0:1] - -; GCN-NOT: s34 - -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use s34 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_preserves_s34(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_preserves_s34: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s34 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s34 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s34 = call i32 asm sideeffect "; def $0", "={s34}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{s34}"(i32 %s34) ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}} - -; GCN-NOT: v32 -; GCN: s_mov_b32 s32, 0 -; GCN-NOT: 
v40 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; def v40 -; GCN-NEXT: ;;#ASMEND -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 - -; MUBUF: s_swappc_b64 s[30:31], s[4:5] -; FLATSCR: s_swappc_b64 s[30:31], s[0:1] - -; GCN-NOT: v40 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; use v40 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_preserves_v40(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_preserves_v40: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %v40 = call i32 asm sideeffect "; def $0", "={v40}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{v40}"(i32 %v40) ret void } -; GCN-LABEL: {{^}}void_func_void_clobber_s33: -; GCN: v_writelane_b32 v0, s33, 0 -; GCN-NEXT: #ASMSTART -; GCN-NEXT: ; clobber -; GCN-NEXT: #ASMEND -; GCN-NEXT: v_readlane_b32 s33, v0, 0 -; GCN: s_setpc_b64 define hidden void @void_func_void_clobber_s33() #2 { +; MUBUF-LABEL: void_func_void_clobber_s33: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; 
MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v0, s33, 0 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s33, v0, 0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: void_func_void_clobber_s33: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v0, s33, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s33, v0, 0 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s33}"() #0 ret void } -; GCN-LABEL: {{^}}void_func_void_clobber_s34: -; GCN: v_writelane_b32 v0, s34, 0 -; GCN-NEXT: #ASMSTART -; GCN-NEXT: ; clobber -; GCN-NEXT: #ASMEND -; GCN-NEXT: v_readlane_b32 s34, v0, 0 -; GCN: s_setpc_b64 define hidden void @void_func_void_clobber_s34() #2 { +; MUBUF-LABEL: void_func_void_clobber_s34: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v0, s34, 0 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s34, v0, 0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload 
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: void_func_void_clobber_s34: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v0, s34, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s34, v0, 0 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s34}"() #0 ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: -; GCN: s_mov_b32 s32, 0 -; GCN: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN: s_swappc_b64 -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { +; FLATSCR-LABEL: test_call_void_func_void_clobber_s33: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_mov_b32 s14, s12 +; FLATSCR-NEXT: s_mov_b32 s13, s11 +; FLATSCR-NEXT: s_mov_b32 s12, s10 +; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] +; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5] +; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 +; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] +; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: s_getpc_b64 s[16:17] +; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s33@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s33@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] +; 
FLATSCR-NEXT: s_endpgm call void @void_func_void_clobber_s33() ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34: -; GCN: s_mov_b32 s32, 0 -; GCN: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN: s_swappc_b64 -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 { +; FLATSCR-LABEL: test_call_void_func_void_clobber_s34: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_mov_b32 s14, s12 +; FLATSCR-NEXT: s_mov_b32 s13, s11 +; FLATSCR-NEXT: s_mov_b32 s12, s10 +; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] +; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5] +; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 +; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] +; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: s_getpc_b64 s[16:17] +; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s34@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s34@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] +; FLATSCR-NEXT: s_endpgm call void @void_func_void_clobber_s34() ret void } -; GCN-LABEL: {{^}}callee_saved_sgpr_func: -; GCN-NOT: s40 -; GCN: v_writelane_b32 v40, s40 -; GCN: s_swappc_b64 -; GCN-NOT: s40 -; GCN: ; use s40 -; GCN-NOT: s40 -; GCN: v_readlane_b32 s40, v40 -; GCN-NOT: s40 define void @callee_saved_sgpr_func() #2 { +; MUBUF-LABEL: callee_saved_sgpr_func: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v40, s4, 3 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s34, 0 +; 
MUBUF-NEXT: v_writelane_b32 v40, s30, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 2 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_mov_b32 s34, s40 +; MUBUF-NEXT: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MUBUF-NEXT: v_readlane_b32 s30, v40, 1 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s34 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s31, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s34, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 3 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_saved_sgpr_func: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 3 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 2 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s34, s40 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 1 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s34 +; FLATSCR-NEXT: ;;#ASMEND +; 
FLATSCR-NEXT: v_readlane_b32 s31, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 3 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 call void @external_void_func_void() call void asm sideeffect "; use $0", "s"(i32 %s40) #0 ret void } -; GCN-LABEL: {{^}}callee_saved_sgpr_kernel: -; GCN-NOT: s40 -; GCN: ; def s40 -; GCN-NOT: s40 -; GCN: s_swappc_b64 -; GCN-NOT: s40 -; GCN: ; use s40 -; GCN-NOT: s40 define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 { +; FLATSCR-LABEL: callee_saved_sgpr_kernel: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s33, s40 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s33 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 call void @external_void_func_void() call void asm sideeffect "; use $0", "s"(i32 %s40) #0 @@ -310,16 +615,94 @@ define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 { } ; First call preserved VGPR is used so it can't be used for SGPR spills. 
-; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func: -; GCN-NOT: s40 -; GCN: v_writelane_b32 v41, s40 -; GCN: s_swappc_b64 -; GCN-NOT: s40 -; GCN: ; use s40 -; GCN-NOT: s40 -; GCN: v_readlane_b32 s40, v41 -; GCN-NOT: s40 define void @callee_saved_sgpr_vgpr_func() #2 { +; MUBUF-LABEL: callee_saved_sgpr_vgpr_func: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v41, s4, 3 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 v41, s34, 0 +; MUBUF-NEXT: v_writelane_b32 v41, s30, 1 +; MUBUF-NEXT: v_writelane_b32 v41, s31, 2 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_mov_b32 s34, s40 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def v40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s34 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use v40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: v_readlane_b32 s30, v41, 1 +; MUBUF-NEXT: v_readlane_b32 s31, v41, 2 +; MUBUF-NEXT: v_readlane_b32 s34, v41, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v41, 3 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: 
s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_saved_sgpr_vgpr_func: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v41, s0, 3 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: v_writelane_b32 v41, s34, 0 +; FLATSCR-NEXT: v_writelane_b32 v41, s30, 1 +; FLATSCR-NEXT: v_writelane_b32 v41, s31, 2 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s34, s40 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s34 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: v_readlane_b32 s30, v41, 1 +; FLATSCR-NEXT: v_readlane_b32 s31, v41, 2 +; FLATSCR-NEXT: v_readlane_b32 s34, v41, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v41, 3 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 %v40 = call i32 asm sideeffect "; def v40", "={v40}"() #0 call void @external_void_func_void() @@ -328,15 
+711,31 @@ define void @callee_saved_sgpr_vgpr_func() #2 { ret void } -; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_kernel: -; GCN-NOT: s40 -; GCN: ; def s40 -; GCN-NOT: s40 -; GCN: s_swappc_b64 -; GCN-NOT: s40 -; GCN: ; use s40 -; GCN-NOT: s40 define amdgpu_kernel void @callee_saved_sgpr_vgpr_kernel() #2 { +; FLATSCR-LABEL: callee_saved_sgpr_vgpr_kernel: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s33, s40 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def v32 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_mov_b32_e32 v40, v32 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s33 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0 call void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index 47c5d6a78c936..c9810b8fc6195 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -257,50 +257,26 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; MUBUF-NEXT: v_writelane_b32 v40, s37, 1 ; MUBUF-NEXT: v_writelane_b32 v40, s38, 2 ; MUBUF-NEXT: v_writelane_b32 v40, s39, 3 -; MUBUF-NEXT: v_writelane_b32 v40, s40, 4 -; MUBUF-NEXT: v_writelane_b32 v40, s41, 5 -; MUBUF-NEXT: v_writelane_b32 v40, s42, 6 -; MUBUF-NEXT: v_writelane_b32 v40, s43, 7 -; MUBUF-NEXT: v_writelane_b32 v40, 
s44, 8 -; MUBUF-NEXT: v_writelane_b32 v40, s45, 9 -; MUBUF-NEXT: v_writelane_b32 v40, s46, 10 -; MUBUF-NEXT: v_writelane_b32 v40, s47, 11 -; MUBUF-NEXT: v_writelane_b32 v40, s48, 12 -; MUBUF-NEXT: v_writelane_b32 v40, s49, 13 -; MUBUF-NEXT: v_writelane_b32 v40, s50, 14 -; MUBUF-NEXT: v_writelane_b32 v40, s51, 15 -; MUBUF-NEXT: v_writelane_b32 v40, s52, 16 -; MUBUF-NEXT: v_writelane_b32 v40, s53, 17 -; MUBUF-NEXT: v_writelane_b32 v40, s54, 18 -; MUBUF-NEXT: v_writelane_b32 v40, s55, 19 -; MUBUF-NEXT: v_writelane_b32 v40, s56, 20 -; MUBUF-NEXT: v_writelane_b32 v40, s57, 21 -; MUBUF-NEXT: v_writelane_b32 v40, s58, 22 -; MUBUF-NEXT: v_writelane_b32 v40, s59, 23 -; MUBUF-NEXT: v_writelane_b32 v40, s60, 24 -; MUBUF-NEXT: v_writelane_b32 v40, s61, 25 -; MUBUF-NEXT: v_writelane_b32 v40, s62, 26 -; MUBUF-NEXT: v_writelane_b32 v40, s63, 27 -; MUBUF-NEXT: v_writelane_b32 v40, s64, 28 -; MUBUF-NEXT: v_writelane_b32 v40, s65, 29 -; MUBUF-NEXT: v_writelane_b32 v40, s66, 30 -; MUBUF-NEXT: v_writelane_b32 v40, s67, 31 -; MUBUF-NEXT: v_writelane_b32 v40, s68, 32 -; MUBUF-NEXT: v_writelane_b32 v40, s69, 33 -; MUBUF-NEXT: v_writelane_b32 v40, s70, 34 -; MUBUF-NEXT: v_writelane_b32 v40, s71, 35 -; MUBUF-NEXT: v_writelane_b32 v40, s72, 36 -; MUBUF-NEXT: v_writelane_b32 v40, s73, 37 -; MUBUF-NEXT: v_writelane_b32 v40, s74, 38 -; MUBUF-NEXT: v_writelane_b32 v40, s75, 39 -; MUBUF-NEXT: v_writelane_b32 v40, s76, 40 -; MUBUF-NEXT: v_writelane_b32 v40, s77, 41 -; MUBUF-NEXT: v_writelane_b32 v40, s78, 42 -; MUBUF-NEXT: v_writelane_b32 v40, s79, 43 -; MUBUF-NEXT: v_writelane_b32 v40, s80, 44 -; MUBUF-NEXT: v_writelane_b32 v40, s81, 45 -; MUBUF-NEXT: v_writelane_b32 v40, s82, 46 -; MUBUF-NEXT: v_writelane_b32 v40, s83, 47 +; MUBUF-NEXT: v_writelane_b32 v40, s48, 4 +; MUBUF-NEXT: v_writelane_b32 v40, s49, 5 +; MUBUF-NEXT: v_writelane_b32 v40, s50, 6 +; MUBUF-NEXT: v_writelane_b32 v40, s51, 7 +; MUBUF-NEXT: v_writelane_b32 v40, s52, 8 +; MUBUF-NEXT: v_writelane_b32 v40, s53, 9 +; MUBUF-NEXT: 
v_writelane_b32 v40, s54, 10 +; MUBUF-NEXT: v_writelane_b32 v40, s55, 11 +; MUBUF-NEXT: v_writelane_b32 v40, s64, 12 +; MUBUF-NEXT: v_writelane_b32 v40, s65, 13 +; MUBUF-NEXT: v_writelane_b32 v40, s66, 14 +; MUBUF-NEXT: v_writelane_b32 v40, s67, 15 +; MUBUF-NEXT: v_writelane_b32 v40, s68, 16 +; MUBUF-NEXT: v_writelane_b32 v40, s69, 17 +; MUBUF-NEXT: v_writelane_b32 v40, s70, 18 +; MUBUF-NEXT: v_writelane_b32 v40, s71, 19 +; MUBUF-NEXT: v_writelane_b32 v40, s80, 20 +; MUBUF-NEXT: v_writelane_b32 v40, s81, 21 +; MUBUF-NEXT: v_writelane_b32 v40, s82, 22 +; MUBUF-NEXT: v_writelane_b32 v40, s83, 23 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART @@ -347,50 +323,26 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; use s[4:19] ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_readlane_b32 s83, v40, 47 -; MUBUF-NEXT: v_readlane_b32 s82, v40, 46 -; MUBUF-NEXT: v_readlane_b32 s81, v40, 45 -; MUBUF-NEXT: v_readlane_b32 s80, v40, 44 -; MUBUF-NEXT: v_readlane_b32 s79, v40, 43 -; MUBUF-NEXT: v_readlane_b32 s78, v40, 42 -; MUBUF-NEXT: v_readlane_b32 s77, v40, 41 -; MUBUF-NEXT: v_readlane_b32 s76, v40, 40 -; MUBUF-NEXT: v_readlane_b32 s75, v40, 39 -; MUBUF-NEXT: v_readlane_b32 s74, v40, 38 -; MUBUF-NEXT: v_readlane_b32 s73, v40, 37 -; MUBUF-NEXT: v_readlane_b32 s72, v40, 36 -; MUBUF-NEXT: v_readlane_b32 s71, v40, 35 -; MUBUF-NEXT: v_readlane_b32 s70, v40, 34 -; MUBUF-NEXT: v_readlane_b32 s69, v40, 33 -; MUBUF-NEXT: v_readlane_b32 s68, v40, 32 -; MUBUF-NEXT: v_readlane_b32 s67, v40, 31 -; MUBUF-NEXT: v_readlane_b32 s66, v40, 30 -; MUBUF-NEXT: v_readlane_b32 s65, v40, 29 -; MUBUF-NEXT: v_readlane_b32 s64, v40, 28 -; MUBUF-NEXT: v_readlane_b32 s63, v40, 27 -; MUBUF-NEXT: v_readlane_b32 s62, v40, 26 -; MUBUF-NEXT: v_readlane_b32 s61, v40, 25 -; MUBUF-NEXT: v_readlane_b32 s60, v40, 24 -; MUBUF-NEXT: v_readlane_b32 s59, v40, 23 -; MUBUF-NEXT: v_readlane_b32 s58, v40, 22 -; MUBUF-NEXT: v_readlane_b32 s57, v40, 21 -; 
MUBUF-NEXT: v_readlane_b32 s56, v40, 20 -; MUBUF-NEXT: v_readlane_b32 s55, v40, 19 -; MUBUF-NEXT: v_readlane_b32 s54, v40, 18 -; MUBUF-NEXT: v_readlane_b32 s53, v40, 17 -; MUBUF-NEXT: v_readlane_b32 s52, v40, 16 -; MUBUF-NEXT: v_readlane_b32 s51, v40, 15 -; MUBUF-NEXT: v_readlane_b32 s50, v40, 14 -; MUBUF-NEXT: v_readlane_b32 s49, v40, 13 -; MUBUF-NEXT: v_readlane_b32 s48, v40, 12 -; MUBUF-NEXT: v_readlane_b32 s47, v40, 11 -; MUBUF-NEXT: v_readlane_b32 s46, v40, 10 -; MUBUF-NEXT: v_readlane_b32 s45, v40, 9 -; MUBUF-NEXT: v_readlane_b32 s44, v40, 8 -; MUBUF-NEXT: v_readlane_b32 s43, v40, 7 -; MUBUF-NEXT: v_readlane_b32 s42, v40, 6 -; MUBUF-NEXT: v_readlane_b32 s41, v40, 5 -; MUBUF-NEXT: v_readlane_b32 s40, v40, 4 +; MUBUF-NEXT: v_readlane_b32 s83, v40, 23 +; MUBUF-NEXT: v_readlane_b32 s82, v40, 22 +; MUBUF-NEXT: v_readlane_b32 s81, v40, 21 +; MUBUF-NEXT: v_readlane_b32 s80, v40, 20 +; MUBUF-NEXT: v_readlane_b32 s71, v40, 19 +; MUBUF-NEXT: v_readlane_b32 s70, v40, 18 +; MUBUF-NEXT: v_readlane_b32 s69, v40, 17 +; MUBUF-NEXT: v_readlane_b32 s68, v40, 16 +; MUBUF-NEXT: v_readlane_b32 s67, v40, 15 +; MUBUF-NEXT: v_readlane_b32 s66, v40, 14 +; MUBUF-NEXT: v_readlane_b32 s65, v40, 13 +; MUBUF-NEXT: v_readlane_b32 s64, v40, 12 +; MUBUF-NEXT: v_readlane_b32 s55, v40, 11 +; MUBUF-NEXT: v_readlane_b32 s54, v40, 10 +; MUBUF-NEXT: v_readlane_b32 s53, v40, 9 +; MUBUF-NEXT: v_readlane_b32 s52, v40, 8 +; MUBUF-NEXT: v_readlane_b32 s51, v40, 7 +; MUBUF-NEXT: v_readlane_b32 s50, v40, 6 +; MUBUF-NEXT: v_readlane_b32 s49, v40, 5 +; MUBUF-NEXT: v_readlane_b32 s48, v40, 4 ; MUBUF-NEXT: v_readlane_b32 s39, v40, 3 ; MUBUF-NEXT: v_readlane_b32 s38, v40, 2 ; MUBUF-NEXT: v_readlane_b32 s37, v40, 1 @@ -407,50 +359,24 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0 -; FLATSCR-NEXT: 
v_writelane_b32 v40, s35, 1 -; FLATSCR-NEXT: v_writelane_b32 v40, s36, 2 -; FLATSCR-NEXT: v_writelane_b32 v40, s37, 3 -; FLATSCR-NEXT: v_writelane_b32 v40, s38, 4 -; FLATSCR-NEXT: v_writelane_b32 v40, s39, 5 -; FLATSCR-NEXT: v_writelane_b32 v40, s40, 6 -; FLATSCR-NEXT: v_writelane_b32 v40, s41, 7 -; FLATSCR-NEXT: v_writelane_b32 v40, s42, 8 -; FLATSCR-NEXT: v_writelane_b32 v40, s43, 9 -; FLATSCR-NEXT: v_writelane_b32 v40, s44, 10 -; FLATSCR-NEXT: v_writelane_b32 v40, s45, 11 -; FLATSCR-NEXT: v_writelane_b32 v40, s46, 12 -; FLATSCR-NEXT: v_writelane_b32 v40, s47, 13 -; FLATSCR-NEXT: v_writelane_b32 v40, s48, 14 -; FLATSCR-NEXT: v_writelane_b32 v40, s49, 15 -; FLATSCR-NEXT: v_writelane_b32 v40, s50, 16 -; FLATSCR-NEXT: v_writelane_b32 v40, s51, 17 -; FLATSCR-NEXT: v_writelane_b32 v40, s52, 18 -; FLATSCR-NEXT: v_writelane_b32 v40, s53, 19 -; FLATSCR-NEXT: v_writelane_b32 v40, s54, 20 -; FLATSCR-NEXT: v_writelane_b32 v40, s55, 21 -; FLATSCR-NEXT: v_writelane_b32 v40, s56, 22 -; FLATSCR-NEXT: v_writelane_b32 v40, s57, 23 -; FLATSCR-NEXT: v_writelane_b32 v40, s58, 24 -; FLATSCR-NEXT: v_writelane_b32 v40, s59, 25 -; FLATSCR-NEXT: v_writelane_b32 v40, s60, 26 -; FLATSCR-NEXT: v_writelane_b32 v40, s61, 27 -; FLATSCR-NEXT: v_writelane_b32 v40, s62, 28 -; FLATSCR-NEXT: v_writelane_b32 v40, s63, 29 -; FLATSCR-NEXT: v_writelane_b32 v40, s64, 30 -; FLATSCR-NEXT: v_writelane_b32 v40, s65, 31 -; FLATSCR-NEXT: v_writelane_b32 v40, s66, 32 -; FLATSCR-NEXT: v_writelane_b32 v40, s67, 33 -; FLATSCR-NEXT: v_writelane_b32 v40, s68, 34 -; FLATSCR-NEXT: v_writelane_b32 v40, s69, 35 -; FLATSCR-NEXT: v_writelane_b32 v40, s70, 36 -; FLATSCR-NEXT: v_writelane_b32 v40, s71, 37 -; FLATSCR-NEXT: v_writelane_b32 v40, s72, 38 -; FLATSCR-NEXT: v_writelane_b32 v40, s73, 39 -; FLATSCR-NEXT: v_writelane_b32 v40, s74, 40 -; FLATSCR-NEXT: v_writelane_b32 v40, s75, 41 -; FLATSCR-NEXT: v_writelane_b32 v40, s30, 42 -; FLATSCR-NEXT: v_writelane_b32 v40, s31, 43 +; FLATSCR-NEXT: v_writelane_b32 v40, s36, 0 +; 
FLATSCR-NEXT: v_writelane_b32 v40, s37, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s38, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s39, 3 +; FLATSCR-NEXT: v_writelane_b32 v40, s48, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s49, 5 +; FLATSCR-NEXT: v_writelane_b32 v40, s50, 6 +; FLATSCR-NEXT: v_writelane_b32 v40, s51, 7 +; FLATSCR-NEXT: v_writelane_b32 v40, s52, 8 +; FLATSCR-NEXT: v_writelane_b32 v40, s53, 9 +; FLATSCR-NEXT: v_writelane_b32 v40, s54, 10 +; FLATSCR-NEXT: v_writelane_b32 v40, s55, 11 +; FLATSCR-NEXT: v_writelane_b32 v40, s64, 12 +; FLATSCR-NEXT: v_writelane_b32 v40, s65, 13 +; FLATSCR-NEXT: v_writelane_b32 v40, s66, 14 +; FLATSCR-NEXT: v_writelane_b32 v40, s67, 15 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 17 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART @@ -474,10 +400,10 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; FLATSCR-NEXT: ; def s[0:15] ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; def s[68:75] +; FLATSCR-NEXT: ; def s[72:79] ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; def s[34:35] +; FLATSCR-NEXT: ; def s[88:89] ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use s[52:67] @@ -488,59 +414,33 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use s[16:31] ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s30, v40, 42 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 16 ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; use s[68:75] +; FLATSCR-NEXT: ; use s[72:79] ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; use s[34:35] +; FLATSCR-NEXT: ; use s[88:89] ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use s[0:15] ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s31, v40, 43 -; FLATSCR-NEXT: v_readlane_b32 s75, v40, 41 -; FLATSCR-NEXT: v_readlane_b32 s74, v40, 40 -; 
FLATSCR-NEXT: v_readlane_b32 s73, v40, 39 -; FLATSCR-NEXT: v_readlane_b32 s72, v40, 38 -; FLATSCR-NEXT: v_readlane_b32 s71, v40, 37 -; FLATSCR-NEXT: v_readlane_b32 s70, v40, 36 -; FLATSCR-NEXT: v_readlane_b32 s69, v40, 35 -; FLATSCR-NEXT: v_readlane_b32 s68, v40, 34 -; FLATSCR-NEXT: v_readlane_b32 s67, v40, 33 -; FLATSCR-NEXT: v_readlane_b32 s66, v40, 32 -; FLATSCR-NEXT: v_readlane_b32 s65, v40, 31 -; FLATSCR-NEXT: v_readlane_b32 s64, v40, 30 -; FLATSCR-NEXT: v_readlane_b32 s63, v40, 29 -; FLATSCR-NEXT: v_readlane_b32 s62, v40, 28 -; FLATSCR-NEXT: v_readlane_b32 s61, v40, 27 -; FLATSCR-NEXT: v_readlane_b32 s60, v40, 26 -; FLATSCR-NEXT: v_readlane_b32 s59, v40, 25 -; FLATSCR-NEXT: v_readlane_b32 s58, v40, 24 -; FLATSCR-NEXT: v_readlane_b32 s57, v40, 23 -; FLATSCR-NEXT: v_readlane_b32 s56, v40, 22 -; FLATSCR-NEXT: v_readlane_b32 s55, v40, 21 -; FLATSCR-NEXT: v_readlane_b32 s54, v40, 20 -; FLATSCR-NEXT: v_readlane_b32 s53, v40, 19 -; FLATSCR-NEXT: v_readlane_b32 s52, v40, 18 -; FLATSCR-NEXT: v_readlane_b32 s51, v40, 17 -; FLATSCR-NEXT: v_readlane_b32 s50, v40, 16 -; FLATSCR-NEXT: v_readlane_b32 s49, v40, 15 -; FLATSCR-NEXT: v_readlane_b32 s48, v40, 14 -; FLATSCR-NEXT: v_readlane_b32 s47, v40, 13 -; FLATSCR-NEXT: v_readlane_b32 s46, v40, 12 -; FLATSCR-NEXT: v_readlane_b32 s45, v40, 11 -; FLATSCR-NEXT: v_readlane_b32 s44, v40, 10 -; FLATSCR-NEXT: v_readlane_b32 s43, v40, 9 -; FLATSCR-NEXT: v_readlane_b32 s42, v40, 8 -; FLATSCR-NEXT: v_readlane_b32 s41, v40, 7 -; FLATSCR-NEXT: v_readlane_b32 s40, v40, 6 -; FLATSCR-NEXT: v_readlane_b32 s39, v40, 5 -; FLATSCR-NEXT: v_readlane_b32 s38, v40, 4 -; FLATSCR-NEXT: v_readlane_b32 s37, v40, 3 -; FLATSCR-NEXT: v_readlane_b32 s36, v40, 2 -; FLATSCR-NEXT: v_readlane_b32 s35, v40, 1 -; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 17 +; FLATSCR-NEXT: v_readlane_b32 s67, v40, 15 +; FLATSCR-NEXT: v_readlane_b32 s66, v40, 14 +; FLATSCR-NEXT: v_readlane_b32 s65, v40, 13 +; FLATSCR-NEXT: v_readlane_b32 
s64, v40, 12 +; FLATSCR-NEXT: v_readlane_b32 s55, v40, 11 +; FLATSCR-NEXT: v_readlane_b32 s54, v40, 10 +; FLATSCR-NEXT: v_readlane_b32 s53, v40, 9 +; FLATSCR-NEXT: v_readlane_b32 s52, v40, 8 +; FLATSCR-NEXT: v_readlane_b32 s51, v40, 7 +; FLATSCR-NEXT: v_readlane_b32 s50, v40, 6 +; FLATSCR-NEXT: v_readlane_b32 s49, v40, 5 +; FLATSCR-NEXT: v_readlane_b32 s48, v40, 4 +; FLATSCR-NEXT: v_readlane_b32 s39, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s38, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s37, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s36, v40, 0 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] @@ -571,39 +471,13 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; Has no spilled CSR VGPRs used for SGPR spilling, so no need to ; enable all lanes and restore. define void @spill_only_csr_sgpr() { -; MUBUF-LABEL: spill_only_csr_sgpr: -; MUBUF: ; %bb.0: -; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: v_writelane_b32 v0, s42, 0 -; MUBUF-NEXT: ;;#ASMSTART -; MUBUF-NEXT: ; clobber s42 -; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_readlane_b32 s42, v0, 0 -; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_setpc_b64 s[30:31] -; -; FLATSCR-LABEL: spill_only_csr_sgpr: -; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill -; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: v_writelane_b32 v0, s42, 0 -; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; clobber s42 -; FLATSCR-NEXT: ;;#ASMEND -; 
FLATSCR-NEXT: v_readlane_b32 s42, v0, 0 -; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: spill_only_csr_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber s42 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber s42", "~{s42}"() ret void } @@ -665,69 +539,37 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; MUBUF-NEXT: v_writelane_b32 v1, s40, 0 -; MUBUF-NEXT: v_writelane_b32 v1, s41, 1 -; MUBUF-NEXT: v_writelane_b32 v1, s42, 2 -; MUBUF-NEXT: v_writelane_b32 v1, s43, 3 -; MUBUF-NEXT: v_writelane_b32 v1, s44, 4 -; MUBUF-NEXT: v_writelane_b32 v1, s45, 5 -; MUBUF-NEXT: v_writelane_b32 v1, s46, 6 -; MUBUF-NEXT: v_writelane_b32 v1, s47, 7 -; MUBUF-NEXT: v_writelane_b32 v1, s48, 8 -; MUBUF-NEXT: v_writelane_b32 v1, s49, 9 -; MUBUF-NEXT: v_writelane_b32 v1, s50, 10 -; MUBUF-NEXT: v_writelane_b32 v1, s51, 11 -; MUBUF-NEXT: v_writelane_b32 v1, s52, 12 -; MUBUF-NEXT: v_writelane_b32 v1, s53, 13 -; MUBUF-NEXT: v_writelane_b32 v1, s54, 14 -; MUBUF-NEXT: v_writelane_b32 v1, s55, 15 -; MUBUF-NEXT: v_writelane_b32 v1, s56, 16 -; MUBUF-NEXT: v_writelane_b32 v1, s57, 17 -; MUBUF-NEXT: v_writelane_b32 v1, s58, 18 -; MUBUF-NEXT: v_writelane_b32 v1, s59, 19 -; MUBUF-NEXT: v_writelane_b32 v1, s60, 20 -; MUBUF-NEXT: v_writelane_b32 v1, s61, 21 -; MUBUF-NEXT: v_writelane_b32 v1, s62, 22 -; MUBUF-NEXT: v_writelane_b32 v1, s63, 23 -; MUBUF-NEXT: v_writelane_b32 v1, s64, 24 -; MUBUF-NEXT: v_writelane_b32 v1, s65, 25 -; MUBUF-NEXT: v_writelane_b32 v1, s66, 26 -; MUBUF-NEXT: v_writelane_b32 v1, s67, 27 -; MUBUF-NEXT: v_writelane_b32 v1, 
s68, 28 -; MUBUF-NEXT: v_writelane_b32 v1, s69, 29 -; MUBUF-NEXT: v_writelane_b32 v1, s70, 30 -; MUBUF-NEXT: v_writelane_b32 v1, s71, 31 -; MUBUF-NEXT: v_writelane_b32 v1, s72, 32 -; MUBUF-NEXT: v_writelane_b32 v1, s73, 33 -; MUBUF-NEXT: v_writelane_b32 v1, s74, 34 -; MUBUF-NEXT: v_writelane_b32 v1, s75, 35 -; MUBUF-NEXT: v_writelane_b32 v1, s76, 36 -; MUBUF-NEXT: v_writelane_b32 v1, s77, 37 -; MUBUF-NEXT: v_writelane_b32 v1, s78, 38 -; MUBUF-NEXT: v_writelane_b32 v1, s79, 39 -; MUBUF-NEXT: v_writelane_b32 v1, s80, 40 -; MUBUF-NEXT: v_writelane_b32 v1, s81, 41 -; MUBUF-NEXT: v_writelane_b32 v1, s82, 42 -; MUBUF-NEXT: v_writelane_b32 v1, s83, 43 -; MUBUF-NEXT: v_writelane_b32 v1, s84, 44 -; MUBUF-NEXT: v_writelane_b32 v1, s85, 45 -; MUBUF-NEXT: v_writelane_b32 v1, s86, 46 -; MUBUF-NEXT: v_writelane_b32 v1, s87, 47 -; MUBUF-NEXT: v_writelane_b32 v1, s88, 48 -; MUBUF-NEXT: v_writelane_b32 v1, s89, 49 -; MUBUF-NEXT: v_writelane_b32 v1, s90, 50 -; MUBUF-NEXT: v_writelane_b32 v1, s91, 51 -; MUBUF-NEXT: v_writelane_b32 v1, s92, 52 -; MUBUF-NEXT: v_writelane_b32 v1, s93, 53 -; MUBUF-NEXT: v_writelane_b32 v1, s94, 54 -; MUBUF-NEXT: v_writelane_b32 v1, s95, 55 -; MUBUF-NEXT: v_writelane_b32 v1, s96, 56 -; MUBUF-NEXT: v_writelane_b32 v1, s97, 57 -; MUBUF-NEXT: v_writelane_b32 v1, s98, 58 -; MUBUF-NEXT: v_writelane_b32 v1, s99, 59 -; MUBUF-NEXT: v_writelane_b32 v1, s100, 60 -; MUBUF-NEXT: v_writelane_b32 v1, s101, 61 -; MUBUF-NEXT: v_writelane_b32 v1, s102, 62 +; MUBUF-NEXT: v_writelane_b32 v1, s48, 0 +; MUBUF-NEXT: v_writelane_b32 v1, s49, 1 +; MUBUF-NEXT: v_writelane_b32 v1, s50, 2 +; MUBUF-NEXT: v_writelane_b32 v1, s51, 3 +; MUBUF-NEXT: v_writelane_b32 v1, s52, 4 +; MUBUF-NEXT: v_writelane_b32 v1, s53, 5 +; MUBUF-NEXT: v_writelane_b32 v1, s54, 6 +; MUBUF-NEXT: v_writelane_b32 v1, s55, 7 +; MUBUF-NEXT: v_writelane_b32 v1, s64, 8 +; MUBUF-NEXT: v_writelane_b32 v1, s65, 9 +; MUBUF-NEXT: v_writelane_b32 v1, s66, 10 +; MUBUF-NEXT: v_writelane_b32 v1, s67, 11 +; MUBUF-NEXT: 
v_writelane_b32 v1, s68, 12 +; MUBUF-NEXT: v_writelane_b32 v1, s69, 13 +; MUBUF-NEXT: v_writelane_b32 v1, s70, 14 +; MUBUF-NEXT: v_writelane_b32 v1, s71, 15 +; MUBUF-NEXT: v_writelane_b32 v1, s80, 16 +; MUBUF-NEXT: v_writelane_b32 v1, s81, 17 +; MUBUF-NEXT: v_writelane_b32 v1, s82, 18 +; MUBUF-NEXT: v_writelane_b32 v1, s83, 19 +; MUBUF-NEXT: v_writelane_b32 v1, s84, 20 +; MUBUF-NEXT: v_writelane_b32 v1, s85, 21 +; MUBUF-NEXT: v_writelane_b32 v1, s86, 22 +; MUBUF-NEXT: v_writelane_b32 v1, s87, 23 +; MUBUF-NEXT: v_writelane_b32 v1, s96, 24 +; MUBUF-NEXT: v_writelane_b32 v1, s97, 25 +; MUBUF-NEXT: v_writelane_b32 v1, s98, 26 +; MUBUF-NEXT: v_writelane_b32 v1, s99, 27 +; MUBUF-NEXT: v_writelane_b32 v1, s100, 28 +; MUBUF-NEXT: v_writelane_b32 v1, s101, 29 +; MUBUF-NEXT: v_writelane_b32 v1, s102, 30 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -737,69 +579,37 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; MUBUF-NEXT: v_readlane_b32 s102, v1, 62 -; MUBUF-NEXT: v_readlane_b32 s101, v1, 61 -; MUBUF-NEXT: v_readlane_b32 s100, v1, 60 -; MUBUF-NEXT: v_readlane_b32 s99, v1, 59 -; MUBUF-NEXT: v_readlane_b32 s98, v1, 58 -; MUBUF-NEXT: v_readlane_b32 s97, v1, 57 -; MUBUF-NEXT: v_readlane_b32 s96, v1, 56 -; MUBUF-NEXT: v_readlane_b32 s95, v1, 55 -; MUBUF-NEXT: v_readlane_b32 s94, v1, 54 -; MUBUF-NEXT: v_readlane_b32 s93, v1, 53 -; MUBUF-NEXT: v_readlane_b32 s92, v1, 52 -; MUBUF-NEXT: v_readlane_b32 s91, v1, 51 -; MUBUF-NEXT: v_readlane_b32 s90, v1, 50 -; MUBUF-NEXT: v_readlane_b32 s89, v1, 49 -; MUBUF-NEXT: v_readlane_b32 s88, v1, 48 -; MUBUF-NEXT: v_readlane_b32 s87, v1, 47 -; MUBUF-NEXT: v_readlane_b32 s86, v1, 46 -; MUBUF-NEXT: v_readlane_b32 s85, v1, 45 -; MUBUF-NEXT: v_readlane_b32 s84, v1, 44 -; MUBUF-NEXT: v_readlane_b32 s83, v1, 43 -; MUBUF-NEXT: 
v_readlane_b32 s82, v1, 42 -; MUBUF-NEXT: v_readlane_b32 s81, v1, 41 -; MUBUF-NEXT: v_readlane_b32 s80, v1, 40 -; MUBUF-NEXT: v_readlane_b32 s79, v1, 39 -; MUBUF-NEXT: v_readlane_b32 s78, v1, 38 -; MUBUF-NEXT: v_readlane_b32 s77, v1, 37 -; MUBUF-NEXT: v_readlane_b32 s76, v1, 36 -; MUBUF-NEXT: v_readlane_b32 s75, v1, 35 -; MUBUF-NEXT: v_readlane_b32 s74, v1, 34 -; MUBUF-NEXT: v_readlane_b32 s73, v1, 33 -; MUBUF-NEXT: v_readlane_b32 s72, v1, 32 -; MUBUF-NEXT: v_readlane_b32 s71, v1, 31 -; MUBUF-NEXT: v_readlane_b32 s70, v1, 30 -; MUBUF-NEXT: v_readlane_b32 s69, v1, 29 -; MUBUF-NEXT: v_readlane_b32 s68, v1, 28 -; MUBUF-NEXT: v_readlane_b32 s67, v1, 27 -; MUBUF-NEXT: v_readlane_b32 s66, v1, 26 -; MUBUF-NEXT: v_readlane_b32 s65, v1, 25 -; MUBUF-NEXT: v_readlane_b32 s64, v1, 24 -; MUBUF-NEXT: v_readlane_b32 s63, v1, 23 -; MUBUF-NEXT: v_readlane_b32 s62, v1, 22 -; MUBUF-NEXT: v_readlane_b32 s61, v1, 21 -; MUBUF-NEXT: v_readlane_b32 s60, v1, 20 -; MUBUF-NEXT: v_readlane_b32 s59, v1, 19 -; MUBUF-NEXT: v_readlane_b32 s58, v1, 18 -; MUBUF-NEXT: v_readlane_b32 s57, v1, 17 -; MUBUF-NEXT: v_readlane_b32 s56, v1, 16 -; MUBUF-NEXT: v_readlane_b32 s55, v1, 15 -; MUBUF-NEXT: v_readlane_b32 s54, v1, 14 -; MUBUF-NEXT: v_readlane_b32 s53, v1, 13 -; MUBUF-NEXT: v_readlane_b32 s52, v1, 12 -; MUBUF-NEXT: v_readlane_b32 s51, v1, 11 -; MUBUF-NEXT: v_readlane_b32 s50, v1, 10 -; MUBUF-NEXT: v_readlane_b32 s49, v1, 9 -; MUBUF-NEXT: v_readlane_b32 s48, v1, 8 -; MUBUF-NEXT: v_readlane_b32 s47, v1, 7 -; MUBUF-NEXT: v_readlane_b32 s46, v1, 6 -; MUBUF-NEXT: v_readlane_b32 s45, v1, 5 -; MUBUF-NEXT: v_readlane_b32 s44, v1, 4 -; MUBUF-NEXT: v_readlane_b32 s43, v1, 3 -; MUBUF-NEXT: v_readlane_b32 s42, v1, 2 -; MUBUF-NEXT: v_readlane_b32 s41, v1, 1 -; MUBUF-NEXT: v_readlane_b32 s40, v1, 0 +; MUBUF-NEXT: v_readlane_b32 s102, v1, 30 +; MUBUF-NEXT: v_readlane_b32 s101, v1, 29 +; MUBUF-NEXT: v_readlane_b32 s100, v1, 28 +; MUBUF-NEXT: v_readlane_b32 s99, v1, 27 +; MUBUF-NEXT: v_readlane_b32 s98, v1, 26 +; 
MUBUF-NEXT: v_readlane_b32 s97, v1, 25 +; MUBUF-NEXT: v_readlane_b32 s96, v1, 24 +; MUBUF-NEXT: v_readlane_b32 s87, v1, 23 +; MUBUF-NEXT: v_readlane_b32 s86, v1, 22 +; MUBUF-NEXT: v_readlane_b32 s85, v1, 21 +; MUBUF-NEXT: v_readlane_b32 s84, v1, 20 +; MUBUF-NEXT: v_readlane_b32 s83, v1, 19 +; MUBUF-NEXT: v_readlane_b32 s82, v1, 18 +; MUBUF-NEXT: v_readlane_b32 s81, v1, 17 +; MUBUF-NEXT: v_readlane_b32 s80, v1, 16 +; MUBUF-NEXT: v_readlane_b32 s71, v1, 15 +; MUBUF-NEXT: v_readlane_b32 s70, v1, 14 +; MUBUF-NEXT: v_readlane_b32 s69, v1, 13 +; MUBUF-NEXT: v_readlane_b32 s68, v1, 12 +; MUBUF-NEXT: v_readlane_b32 s67, v1, 11 +; MUBUF-NEXT: v_readlane_b32 s66, v1, 10 +; MUBUF-NEXT: v_readlane_b32 s65, v1, 9 +; MUBUF-NEXT: v_readlane_b32 s64, v1, 8 +; MUBUF-NEXT: v_readlane_b32 s55, v1, 7 +; MUBUF-NEXT: v_readlane_b32 s54, v1, 6 +; MUBUF-NEXT: v_readlane_b32 s53, v1, 5 +; MUBUF-NEXT: v_readlane_b32 s52, v1, 4 +; MUBUF-NEXT: v_readlane_b32 s51, v1, 3 +; MUBUF-NEXT: v_readlane_b32 s50, v1, 2 +; MUBUF-NEXT: v_readlane_b32 s49, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s48, v1, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -818,69 +628,37 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; FLATSCR-NEXT: v_writelane_b32 v1, s40, 0 -; FLATSCR-NEXT: v_writelane_b32 v1, s41, 1 -; FLATSCR-NEXT: v_writelane_b32 v1, s42, 2 -; FLATSCR-NEXT: v_writelane_b32 v1, s43, 3 -; FLATSCR-NEXT: v_writelane_b32 v1, s44, 4 -; FLATSCR-NEXT: v_writelane_b32 v1, s45, 5 -; FLATSCR-NEXT: v_writelane_b32 v1, s46, 6 -; FLATSCR-NEXT: v_writelane_b32 v1, s47, 7 -; FLATSCR-NEXT: v_writelane_b32 v1, s48, 8 -; FLATSCR-NEXT: v_writelane_b32 v1, s49, 9 -; FLATSCR-NEXT: v_writelane_b32 v1, s50, 10 -; FLATSCR-NEXT: v_writelane_b32 v1, s51, 11 -; 
FLATSCR-NEXT: v_writelane_b32 v1, s52, 12 -; FLATSCR-NEXT: v_writelane_b32 v1, s53, 13 -; FLATSCR-NEXT: v_writelane_b32 v1, s54, 14 -; FLATSCR-NEXT: v_writelane_b32 v1, s55, 15 -; FLATSCR-NEXT: v_writelane_b32 v1, s56, 16 -; FLATSCR-NEXT: v_writelane_b32 v1, s57, 17 -; FLATSCR-NEXT: v_writelane_b32 v1, s58, 18 -; FLATSCR-NEXT: v_writelane_b32 v1, s59, 19 -; FLATSCR-NEXT: v_writelane_b32 v1, s60, 20 -; FLATSCR-NEXT: v_writelane_b32 v1, s61, 21 -; FLATSCR-NEXT: v_writelane_b32 v1, s62, 22 -; FLATSCR-NEXT: v_writelane_b32 v1, s63, 23 -; FLATSCR-NEXT: v_writelane_b32 v1, s64, 24 -; FLATSCR-NEXT: v_writelane_b32 v1, s65, 25 -; FLATSCR-NEXT: v_writelane_b32 v1, s66, 26 -; FLATSCR-NEXT: v_writelane_b32 v1, s67, 27 -; FLATSCR-NEXT: v_writelane_b32 v1, s68, 28 -; FLATSCR-NEXT: v_writelane_b32 v1, s69, 29 -; FLATSCR-NEXT: v_writelane_b32 v1, s70, 30 -; FLATSCR-NEXT: v_writelane_b32 v1, s71, 31 -; FLATSCR-NEXT: v_writelane_b32 v1, s72, 32 -; FLATSCR-NEXT: v_writelane_b32 v1, s73, 33 -; FLATSCR-NEXT: v_writelane_b32 v1, s74, 34 -; FLATSCR-NEXT: v_writelane_b32 v1, s75, 35 -; FLATSCR-NEXT: v_writelane_b32 v1, s76, 36 -; FLATSCR-NEXT: v_writelane_b32 v1, s77, 37 -; FLATSCR-NEXT: v_writelane_b32 v1, s78, 38 -; FLATSCR-NEXT: v_writelane_b32 v1, s79, 39 -; FLATSCR-NEXT: v_writelane_b32 v1, s80, 40 -; FLATSCR-NEXT: v_writelane_b32 v1, s81, 41 -; FLATSCR-NEXT: v_writelane_b32 v1, s82, 42 -; FLATSCR-NEXT: v_writelane_b32 v1, s83, 43 -; FLATSCR-NEXT: v_writelane_b32 v1, s84, 44 -; FLATSCR-NEXT: v_writelane_b32 v1, s85, 45 -; FLATSCR-NEXT: v_writelane_b32 v1, s86, 46 -; FLATSCR-NEXT: v_writelane_b32 v1, s87, 47 -; FLATSCR-NEXT: v_writelane_b32 v1, s88, 48 -; FLATSCR-NEXT: v_writelane_b32 v1, s89, 49 -; FLATSCR-NEXT: v_writelane_b32 v1, s90, 50 -; FLATSCR-NEXT: v_writelane_b32 v1, s91, 51 -; FLATSCR-NEXT: v_writelane_b32 v1, s92, 52 -; FLATSCR-NEXT: v_writelane_b32 v1, s93, 53 -; FLATSCR-NEXT: v_writelane_b32 v1, s94, 54 -; FLATSCR-NEXT: v_writelane_b32 v1, s95, 55 -; FLATSCR-NEXT: 
v_writelane_b32 v1, s96, 56 -; FLATSCR-NEXT: v_writelane_b32 v1, s97, 57 -; FLATSCR-NEXT: v_writelane_b32 v1, s98, 58 -; FLATSCR-NEXT: v_writelane_b32 v1, s99, 59 -; FLATSCR-NEXT: v_writelane_b32 v1, s100, 60 -; FLATSCR-NEXT: v_writelane_b32 v1, s101, 61 -; FLATSCR-NEXT: v_writelane_b32 v1, s102, 62 +; FLATSCR-NEXT: v_writelane_b32 v1, s48, 0 +; FLATSCR-NEXT: v_writelane_b32 v1, s49, 1 +; FLATSCR-NEXT: v_writelane_b32 v1, s50, 2 +; FLATSCR-NEXT: v_writelane_b32 v1, s51, 3 +; FLATSCR-NEXT: v_writelane_b32 v1, s52, 4 +; FLATSCR-NEXT: v_writelane_b32 v1, s53, 5 +; FLATSCR-NEXT: v_writelane_b32 v1, s54, 6 +; FLATSCR-NEXT: v_writelane_b32 v1, s55, 7 +; FLATSCR-NEXT: v_writelane_b32 v1, s64, 8 +; FLATSCR-NEXT: v_writelane_b32 v1, s65, 9 +; FLATSCR-NEXT: v_writelane_b32 v1, s66, 10 +; FLATSCR-NEXT: v_writelane_b32 v1, s67, 11 +; FLATSCR-NEXT: v_writelane_b32 v1, s68, 12 +; FLATSCR-NEXT: v_writelane_b32 v1, s69, 13 +; FLATSCR-NEXT: v_writelane_b32 v1, s70, 14 +; FLATSCR-NEXT: v_writelane_b32 v1, s71, 15 +; FLATSCR-NEXT: v_writelane_b32 v1, s80, 16 +; FLATSCR-NEXT: v_writelane_b32 v1, s81, 17 +; FLATSCR-NEXT: v_writelane_b32 v1, s82, 18 +; FLATSCR-NEXT: v_writelane_b32 v1, s83, 19 +; FLATSCR-NEXT: v_writelane_b32 v1, s84, 20 +; FLATSCR-NEXT: v_writelane_b32 v1, s85, 21 +; FLATSCR-NEXT: v_writelane_b32 v1, s86, 22 +; FLATSCR-NEXT: v_writelane_b32 v1, s87, 23 +; FLATSCR-NEXT: v_writelane_b32 v1, s96, 24 +; FLATSCR-NEXT: v_writelane_b32 v1, s97, 25 +; FLATSCR-NEXT: v_writelane_b32 v1, s98, 26 +; FLATSCR-NEXT: v_writelane_b32 v1, s99, 27 +; FLATSCR-NEXT: v_writelane_b32 v1, s100, 28 +; FLATSCR-NEXT: v_writelane_b32 v1, s101, 29 +; FLATSCR-NEXT: v_writelane_b32 v1, s102, 30 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -890,69 +668,37 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte 
Folded Reload -; FLATSCR-NEXT: v_readlane_b32 s102, v1, 62 -; FLATSCR-NEXT: v_readlane_b32 s101, v1, 61 -; FLATSCR-NEXT: v_readlane_b32 s100, v1, 60 -; FLATSCR-NEXT: v_readlane_b32 s99, v1, 59 -; FLATSCR-NEXT: v_readlane_b32 s98, v1, 58 -; FLATSCR-NEXT: v_readlane_b32 s97, v1, 57 -; FLATSCR-NEXT: v_readlane_b32 s96, v1, 56 -; FLATSCR-NEXT: v_readlane_b32 s95, v1, 55 -; FLATSCR-NEXT: v_readlane_b32 s94, v1, 54 -; FLATSCR-NEXT: v_readlane_b32 s93, v1, 53 -; FLATSCR-NEXT: v_readlane_b32 s92, v1, 52 -; FLATSCR-NEXT: v_readlane_b32 s91, v1, 51 -; FLATSCR-NEXT: v_readlane_b32 s90, v1, 50 -; FLATSCR-NEXT: v_readlane_b32 s89, v1, 49 -; FLATSCR-NEXT: v_readlane_b32 s88, v1, 48 -; FLATSCR-NEXT: v_readlane_b32 s87, v1, 47 -; FLATSCR-NEXT: v_readlane_b32 s86, v1, 46 -; FLATSCR-NEXT: v_readlane_b32 s85, v1, 45 -; FLATSCR-NEXT: v_readlane_b32 s84, v1, 44 -; FLATSCR-NEXT: v_readlane_b32 s83, v1, 43 -; FLATSCR-NEXT: v_readlane_b32 s82, v1, 42 -; FLATSCR-NEXT: v_readlane_b32 s81, v1, 41 -; FLATSCR-NEXT: v_readlane_b32 s80, v1, 40 -; FLATSCR-NEXT: v_readlane_b32 s79, v1, 39 -; FLATSCR-NEXT: v_readlane_b32 s78, v1, 38 -; FLATSCR-NEXT: v_readlane_b32 s77, v1, 37 -; FLATSCR-NEXT: v_readlane_b32 s76, v1, 36 -; FLATSCR-NEXT: v_readlane_b32 s75, v1, 35 -; FLATSCR-NEXT: v_readlane_b32 s74, v1, 34 -; FLATSCR-NEXT: v_readlane_b32 s73, v1, 33 -; FLATSCR-NEXT: v_readlane_b32 s72, v1, 32 -; FLATSCR-NEXT: v_readlane_b32 s71, v1, 31 -; FLATSCR-NEXT: v_readlane_b32 s70, v1, 30 -; FLATSCR-NEXT: v_readlane_b32 s69, v1, 29 -; FLATSCR-NEXT: v_readlane_b32 s68, v1, 28 -; FLATSCR-NEXT: v_readlane_b32 s67, v1, 27 -; FLATSCR-NEXT: v_readlane_b32 s66, v1, 26 -; FLATSCR-NEXT: v_readlane_b32 s65, v1, 25 -; FLATSCR-NEXT: v_readlane_b32 s64, v1, 24 -; FLATSCR-NEXT: v_readlane_b32 s63, v1, 23 -; FLATSCR-NEXT: v_readlane_b32 s62, v1, 22 -; FLATSCR-NEXT: v_readlane_b32 s61, v1, 21 -; FLATSCR-NEXT: v_readlane_b32 s60, v1, 20 -; FLATSCR-NEXT: v_readlane_b32 s59, v1, 19 -; FLATSCR-NEXT: v_readlane_b32 s58, v1, 18 -; 
FLATSCR-NEXT: v_readlane_b32 s57, v1, 17 -; FLATSCR-NEXT: v_readlane_b32 s56, v1, 16 -; FLATSCR-NEXT: v_readlane_b32 s55, v1, 15 -; FLATSCR-NEXT: v_readlane_b32 s54, v1, 14 -; FLATSCR-NEXT: v_readlane_b32 s53, v1, 13 -; FLATSCR-NEXT: v_readlane_b32 s52, v1, 12 -; FLATSCR-NEXT: v_readlane_b32 s51, v1, 11 -; FLATSCR-NEXT: v_readlane_b32 s50, v1, 10 -; FLATSCR-NEXT: v_readlane_b32 s49, v1, 9 -; FLATSCR-NEXT: v_readlane_b32 s48, v1, 8 -; FLATSCR-NEXT: v_readlane_b32 s47, v1, 7 -; FLATSCR-NEXT: v_readlane_b32 s46, v1, 6 -; FLATSCR-NEXT: v_readlane_b32 s45, v1, 5 -; FLATSCR-NEXT: v_readlane_b32 s44, v1, 4 -; FLATSCR-NEXT: v_readlane_b32 s43, v1, 3 -; FLATSCR-NEXT: v_readlane_b32 s42, v1, 2 -; FLATSCR-NEXT: v_readlane_b32 s41, v1, 1 -; FLATSCR-NEXT: v_readlane_b32 s40, v1, 0 +; FLATSCR-NEXT: v_readlane_b32 s102, v1, 30 +; FLATSCR-NEXT: v_readlane_b32 s101, v1, 29 +; FLATSCR-NEXT: v_readlane_b32 s100, v1, 28 +; FLATSCR-NEXT: v_readlane_b32 s99, v1, 27 +; FLATSCR-NEXT: v_readlane_b32 s98, v1, 26 +; FLATSCR-NEXT: v_readlane_b32 s97, v1, 25 +; FLATSCR-NEXT: v_readlane_b32 s96, v1, 24 +; FLATSCR-NEXT: v_readlane_b32 s87, v1, 23 +; FLATSCR-NEXT: v_readlane_b32 s86, v1, 22 +; FLATSCR-NEXT: v_readlane_b32 s85, v1, 21 +; FLATSCR-NEXT: v_readlane_b32 s84, v1, 20 +; FLATSCR-NEXT: v_readlane_b32 s83, v1, 19 +; FLATSCR-NEXT: v_readlane_b32 s82, v1, 18 +; FLATSCR-NEXT: v_readlane_b32 s81, v1, 17 +; FLATSCR-NEXT: v_readlane_b32 s80, v1, 16 +; FLATSCR-NEXT: v_readlane_b32 s71, v1, 15 +; FLATSCR-NEXT: v_readlane_b32 s70, v1, 14 +; FLATSCR-NEXT: v_readlane_b32 s69, v1, 13 +; FLATSCR-NEXT: v_readlane_b32 s68, v1, 12 +; FLATSCR-NEXT: v_readlane_b32 s67, v1, 11 +; FLATSCR-NEXT: v_readlane_b32 s66, v1, 10 +; FLATSCR-NEXT: v_readlane_b32 s65, v1, 9 +; FLATSCR-NEXT: v_readlane_b32 s64, v1, 8 +; FLATSCR-NEXT: v_readlane_b32 s55, v1, 7 +; FLATSCR-NEXT: v_readlane_b32 s54, v1, 6 +; FLATSCR-NEXT: v_readlane_b32 s53, v1, 5 +; FLATSCR-NEXT: v_readlane_b32 s52, v1, 4 +; FLATSCR-NEXT: v_readlane_b32 
s51, v1, 3 +; FLATSCR-NEXT: v_readlane_b32 s50, v1, 2 +; FLATSCR-NEXT: v_readlane_b32 s49, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s48, v1, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:8 ; 4-byte Folded Reload @@ -988,69 +734,37 @@ define void @no_new_vgpr_for_fp_csr() #1 { ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; MUBUF-NEXT: v_writelane_b32 v1, s39, 0 -; MUBUF-NEXT: v_writelane_b32 v1, s40, 1 -; MUBUF-NEXT: v_writelane_b32 v1, s41, 2 -; MUBUF-NEXT: v_writelane_b32 v1, s42, 3 -; MUBUF-NEXT: v_writelane_b32 v1, s43, 4 -; MUBUF-NEXT: v_writelane_b32 v1, s44, 5 -; MUBUF-NEXT: v_writelane_b32 v1, s45, 6 -; MUBUF-NEXT: v_writelane_b32 v1, s46, 7 -; MUBUF-NEXT: v_writelane_b32 v1, s47, 8 -; MUBUF-NEXT: v_writelane_b32 v1, s48, 9 -; MUBUF-NEXT: v_writelane_b32 v1, s49, 10 -; MUBUF-NEXT: v_writelane_b32 v1, s50, 11 -; MUBUF-NEXT: v_writelane_b32 v1, s51, 12 -; MUBUF-NEXT: v_writelane_b32 v1, s52, 13 -; MUBUF-NEXT: v_writelane_b32 v1, s53, 14 -; MUBUF-NEXT: v_writelane_b32 v1, s54, 15 -; MUBUF-NEXT: v_writelane_b32 v1, s55, 16 -; MUBUF-NEXT: v_writelane_b32 v1, s56, 17 -; MUBUF-NEXT: v_writelane_b32 v1, s57, 18 -; MUBUF-NEXT: v_writelane_b32 v1, s58, 19 -; MUBUF-NEXT: v_writelane_b32 v1, s59, 20 -; MUBUF-NEXT: v_writelane_b32 v1, s60, 21 -; MUBUF-NEXT: v_writelane_b32 v1, s61, 22 -; MUBUF-NEXT: v_writelane_b32 v1, s62, 23 -; MUBUF-NEXT: v_writelane_b32 v1, s63, 24 -; MUBUF-NEXT: v_writelane_b32 v1, s64, 25 -; MUBUF-NEXT: v_writelane_b32 v1, s65, 26 -; MUBUF-NEXT: v_writelane_b32 v1, s66, 27 -; MUBUF-NEXT: v_writelane_b32 v1, s67, 28 -; MUBUF-NEXT: v_writelane_b32 v1, s68, 29 -; MUBUF-NEXT: v_writelane_b32 v1, s69, 30 -; MUBUF-NEXT: v_writelane_b32 v1, s70, 31 -; MUBUF-NEXT: v_writelane_b32 v1, s71, 32 -; MUBUF-NEXT: v_writelane_b32 v1, s72, 33 -; MUBUF-NEXT: v_writelane_b32 v1, s73, 34 -; MUBUF-NEXT: 
v_writelane_b32 v1, s74, 35 -; MUBUF-NEXT: v_writelane_b32 v1, s75, 36 -; MUBUF-NEXT: v_writelane_b32 v1, s76, 37 -; MUBUF-NEXT: v_writelane_b32 v1, s77, 38 -; MUBUF-NEXT: v_writelane_b32 v1, s78, 39 -; MUBUF-NEXT: v_writelane_b32 v1, s79, 40 -; MUBUF-NEXT: v_writelane_b32 v1, s80, 41 -; MUBUF-NEXT: v_writelane_b32 v1, s81, 42 -; MUBUF-NEXT: v_writelane_b32 v1, s82, 43 -; MUBUF-NEXT: v_writelane_b32 v1, s83, 44 -; MUBUF-NEXT: v_writelane_b32 v1, s84, 45 -; MUBUF-NEXT: v_writelane_b32 v1, s85, 46 -; MUBUF-NEXT: v_writelane_b32 v1, s86, 47 -; MUBUF-NEXT: v_writelane_b32 v1, s87, 48 -; MUBUF-NEXT: v_writelane_b32 v1, s88, 49 -; MUBUF-NEXT: v_writelane_b32 v1, s89, 50 -; MUBUF-NEXT: v_writelane_b32 v1, s90, 51 -; MUBUF-NEXT: v_writelane_b32 v1, s91, 52 -; MUBUF-NEXT: v_writelane_b32 v1, s92, 53 -; MUBUF-NEXT: v_writelane_b32 v1, s93, 54 -; MUBUF-NEXT: v_writelane_b32 v1, s94, 55 -; MUBUF-NEXT: v_writelane_b32 v1, s95, 56 -; MUBUF-NEXT: v_writelane_b32 v1, s96, 57 -; MUBUF-NEXT: v_writelane_b32 v1, s97, 58 -; MUBUF-NEXT: v_writelane_b32 v1, s98, 59 -; MUBUF-NEXT: v_writelane_b32 v1, s99, 60 -; MUBUF-NEXT: v_writelane_b32 v1, s100, 61 -; MUBUF-NEXT: v_writelane_b32 v1, s101, 62 -; MUBUF-NEXT: v_writelane_b32 v1, s102, 63 +; MUBUF-NEXT: v_writelane_b32 v1, s48, 1 +; MUBUF-NEXT: v_writelane_b32 v1, s49, 2 +; MUBUF-NEXT: v_writelane_b32 v1, s50, 3 +; MUBUF-NEXT: v_writelane_b32 v1, s51, 4 +; MUBUF-NEXT: v_writelane_b32 v1, s52, 5 +; MUBUF-NEXT: v_writelane_b32 v1, s53, 6 +; MUBUF-NEXT: v_writelane_b32 v1, s54, 7 +; MUBUF-NEXT: v_writelane_b32 v1, s55, 8 +; MUBUF-NEXT: v_writelane_b32 v1, s64, 9 +; MUBUF-NEXT: v_writelane_b32 v1, s65, 10 +; MUBUF-NEXT: v_writelane_b32 v1, s66, 11 +; MUBUF-NEXT: v_writelane_b32 v1, s67, 12 +; MUBUF-NEXT: v_writelane_b32 v1, s68, 13 +; MUBUF-NEXT: v_writelane_b32 v1, s69, 14 +; MUBUF-NEXT: v_writelane_b32 v1, s70, 15 +; MUBUF-NEXT: v_writelane_b32 v1, s71, 16 +; MUBUF-NEXT: v_writelane_b32 v1, s80, 17 +; MUBUF-NEXT: v_writelane_b32 v1, s81, 18 
+; MUBUF-NEXT: v_writelane_b32 v1, s82, 19 +; MUBUF-NEXT: v_writelane_b32 v1, s83, 20 +; MUBUF-NEXT: v_writelane_b32 v1, s84, 21 +; MUBUF-NEXT: v_writelane_b32 v1, s85, 22 +; MUBUF-NEXT: v_writelane_b32 v1, s86, 23 +; MUBUF-NEXT: v_writelane_b32 v1, s87, 24 +; MUBUF-NEXT: v_writelane_b32 v1, s96, 25 +; MUBUF-NEXT: v_writelane_b32 v1, s97, 26 +; MUBUF-NEXT: v_writelane_b32 v1, s98, 27 +; MUBUF-NEXT: v_writelane_b32 v1, s99, 28 +; MUBUF-NEXT: v_writelane_b32 v1, s100, 29 +; MUBUF-NEXT: v_writelane_b32 v1, s101, 30 +; MUBUF-NEXT: v_writelane_b32 v1, s102, 31 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -1060,69 +774,37 @@ define void @no_new_vgpr_for_fp_csr() #1 { ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; MUBUF-NEXT: v_readlane_b32 s102, v1, 63 -; MUBUF-NEXT: v_readlane_b32 s101, v1, 62 -; MUBUF-NEXT: v_readlane_b32 s100, v1, 61 -; MUBUF-NEXT: v_readlane_b32 s99, v1, 60 -; MUBUF-NEXT: v_readlane_b32 s98, v1, 59 -; MUBUF-NEXT: v_readlane_b32 s97, v1, 58 -; MUBUF-NEXT: v_readlane_b32 s96, v1, 57 -; MUBUF-NEXT: v_readlane_b32 s95, v1, 56 -; MUBUF-NEXT: v_readlane_b32 s94, v1, 55 -; MUBUF-NEXT: v_readlane_b32 s93, v1, 54 -; MUBUF-NEXT: v_readlane_b32 s92, v1, 53 -; MUBUF-NEXT: v_readlane_b32 s91, v1, 52 -; MUBUF-NEXT: v_readlane_b32 s90, v1, 51 -; MUBUF-NEXT: v_readlane_b32 s89, v1, 50 -; MUBUF-NEXT: v_readlane_b32 s88, v1, 49 -; MUBUF-NEXT: v_readlane_b32 s87, v1, 48 -; MUBUF-NEXT: v_readlane_b32 s86, v1, 47 -; MUBUF-NEXT: v_readlane_b32 s85, v1, 46 -; MUBUF-NEXT: v_readlane_b32 s84, v1, 45 -; MUBUF-NEXT: v_readlane_b32 s83, v1, 44 -; MUBUF-NEXT: v_readlane_b32 s82, v1, 43 -; MUBUF-NEXT: v_readlane_b32 s81, v1, 42 -; MUBUF-NEXT: v_readlane_b32 s80, v1, 41 -; MUBUF-NEXT: v_readlane_b32 s79, v1, 40 -; MUBUF-NEXT: v_readlane_b32 s78, v1, 39 -; MUBUF-NEXT: v_readlane_b32 s77, v1, 38 -; MUBUF-NEXT: 
v_readlane_b32 s76, v1, 37 -; MUBUF-NEXT: v_readlane_b32 s75, v1, 36 -; MUBUF-NEXT: v_readlane_b32 s74, v1, 35 -; MUBUF-NEXT: v_readlane_b32 s73, v1, 34 -; MUBUF-NEXT: v_readlane_b32 s72, v1, 33 -; MUBUF-NEXT: v_readlane_b32 s71, v1, 32 -; MUBUF-NEXT: v_readlane_b32 s70, v1, 31 -; MUBUF-NEXT: v_readlane_b32 s69, v1, 30 -; MUBUF-NEXT: v_readlane_b32 s68, v1, 29 -; MUBUF-NEXT: v_readlane_b32 s67, v1, 28 -; MUBUF-NEXT: v_readlane_b32 s66, v1, 27 -; MUBUF-NEXT: v_readlane_b32 s65, v1, 26 -; MUBUF-NEXT: v_readlane_b32 s64, v1, 25 -; MUBUF-NEXT: v_readlane_b32 s63, v1, 24 -; MUBUF-NEXT: v_readlane_b32 s62, v1, 23 -; MUBUF-NEXT: v_readlane_b32 s61, v1, 22 -; MUBUF-NEXT: v_readlane_b32 s60, v1, 21 -; MUBUF-NEXT: v_readlane_b32 s59, v1, 20 -; MUBUF-NEXT: v_readlane_b32 s58, v1, 19 -; MUBUF-NEXT: v_readlane_b32 s57, v1, 18 -; MUBUF-NEXT: v_readlane_b32 s56, v1, 17 -; MUBUF-NEXT: v_readlane_b32 s55, v1, 16 -; MUBUF-NEXT: v_readlane_b32 s54, v1, 15 -; MUBUF-NEXT: v_readlane_b32 s53, v1, 14 -; MUBUF-NEXT: v_readlane_b32 s52, v1, 13 -; MUBUF-NEXT: v_readlane_b32 s51, v1, 12 -; MUBUF-NEXT: v_readlane_b32 s50, v1, 11 -; MUBUF-NEXT: v_readlane_b32 s49, v1, 10 -; MUBUF-NEXT: v_readlane_b32 s48, v1, 9 -; MUBUF-NEXT: v_readlane_b32 s47, v1, 8 -; MUBUF-NEXT: v_readlane_b32 s46, v1, 7 -; MUBUF-NEXT: v_readlane_b32 s45, v1, 6 -; MUBUF-NEXT: v_readlane_b32 s44, v1, 5 -; MUBUF-NEXT: v_readlane_b32 s43, v1, 4 -; MUBUF-NEXT: v_readlane_b32 s42, v1, 3 -; MUBUF-NEXT: v_readlane_b32 s41, v1, 2 -; MUBUF-NEXT: v_readlane_b32 s40, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s102, v1, 31 +; MUBUF-NEXT: v_readlane_b32 s101, v1, 30 +; MUBUF-NEXT: v_readlane_b32 s100, v1, 29 +; MUBUF-NEXT: v_readlane_b32 s99, v1, 28 +; MUBUF-NEXT: v_readlane_b32 s98, v1, 27 +; MUBUF-NEXT: v_readlane_b32 s97, v1, 26 +; MUBUF-NEXT: v_readlane_b32 s96, v1, 25 +; MUBUF-NEXT: v_readlane_b32 s87, v1, 24 +; MUBUF-NEXT: v_readlane_b32 s86, v1, 23 +; MUBUF-NEXT: v_readlane_b32 s85, v1, 22 +; MUBUF-NEXT: v_readlane_b32 s84, v1, 21 +; 
MUBUF-NEXT: v_readlane_b32 s83, v1, 20 +; MUBUF-NEXT: v_readlane_b32 s82, v1, 19 +; MUBUF-NEXT: v_readlane_b32 s81, v1, 18 +; MUBUF-NEXT: v_readlane_b32 s80, v1, 17 +; MUBUF-NEXT: v_readlane_b32 s71, v1, 16 +; MUBUF-NEXT: v_readlane_b32 s70, v1, 15 +; MUBUF-NEXT: v_readlane_b32 s69, v1, 14 +; MUBUF-NEXT: v_readlane_b32 s68, v1, 13 +; MUBUF-NEXT: v_readlane_b32 s67, v1, 12 +; MUBUF-NEXT: v_readlane_b32 s66, v1, 11 +; MUBUF-NEXT: v_readlane_b32 s65, v1, 10 +; MUBUF-NEXT: v_readlane_b32 s64, v1, 9 +; MUBUF-NEXT: v_readlane_b32 s55, v1, 8 +; MUBUF-NEXT: v_readlane_b32 s54, v1, 7 +; MUBUF-NEXT: v_readlane_b32 s53, v1, 6 +; MUBUF-NEXT: v_readlane_b32 s52, v1, 5 +; MUBUF-NEXT: v_readlane_b32 s51, v1, 4 +; MUBUF-NEXT: v_readlane_b32 s50, v1, 3 +; MUBUF-NEXT: v_readlane_b32 s49, v1, 2 +; MUBUF-NEXT: v_readlane_b32 s48, v1, 1 ; MUBUF-NEXT: v_readlane_b32 s39, v1, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 @@ -1143,69 +825,37 @@ define void @no_new_vgpr_for_fp_csr() #1 { ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; FLATSCR-NEXT: v_writelane_b32 v1, s39, 0 -; FLATSCR-NEXT: v_writelane_b32 v1, s40, 1 -; FLATSCR-NEXT: v_writelane_b32 v1, s41, 2 -; FLATSCR-NEXT: v_writelane_b32 v1, s42, 3 -; FLATSCR-NEXT: v_writelane_b32 v1, s43, 4 -; FLATSCR-NEXT: v_writelane_b32 v1, s44, 5 -; FLATSCR-NEXT: v_writelane_b32 v1, s45, 6 -; FLATSCR-NEXT: v_writelane_b32 v1, s46, 7 -; FLATSCR-NEXT: v_writelane_b32 v1, s47, 8 -; FLATSCR-NEXT: v_writelane_b32 v1, s48, 9 -; FLATSCR-NEXT: v_writelane_b32 v1, s49, 10 -; FLATSCR-NEXT: v_writelane_b32 v1, s50, 11 -; FLATSCR-NEXT: v_writelane_b32 v1, s51, 12 -; FLATSCR-NEXT: v_writelane_b32 v1, s52, 13 -; FLATSCR-NEXT: v_writelane_b32 v1, s53, 14 -; FLATSCR-NEXT: v_writelane_b32 v1, s54, 15 -; FLATSCR-NEXT: v_writelane_b32 v1, s55, 16 -; FLATSCR-NEXT: v_writelane_b32 v1, s56, 17 -; FLATSCR-NEXT: v_writelane_b32 v1, s57, 18 -; FLATSCR-NEXT: v_writelane_b32 
v1, s58, 19 -; FLATSCR-NEXT: v_writelane_b32 v1, s59, 20 -; FLATSCR-NEXT: v_writelane_b32 v1, s60, 21 -; FLATSCR-NEXT: v_writelane_b32 v1, s61, 22 -; FLATSCR-NEXT: v_writelane_b32 v1, s62, 23 -; FLATSCR-NEXT: v_writelane_b32 v1, s63, 24 -; FLATSCR-NEXT: v_writelane_b32 v1, s64, 25 -; FLATSCR-NEXT: v_writelane_b32 v1, s65, 26 -; FLATSCR-NEXT: v_writelane_b32 v1, s66, 27 -; FLATSCR-NEXT: v_writelane_b32 v1, s67, 28 -; FLATSCR-NEXT: v_writelane_b32 v1, s68, 29 -; FLATSCR-NEXT: v_writelane_b32 v1, s69, 30 -; FLATSCR-NEXT: v_writelane_b32 v1, s70, 31 -; FLATSCR-NEXT: v_writelane_b32 v1, s71, 32 -; FLATSCR-NEXT: v_writelane_b32 v1, s72, 33 -; FLATSCR-NEXT: v_writelane_b32 v1, s73, 34 -; FLATSCR-NEXT: v_writelane_b32 v1, s74, 35 -; FLATSCR-NEXT: v_writelane_b32 v1, s75, 36 -; FLATSCR-NEXT: v_writelane_b32 v1, s76, 37 -; FLATSCR-NEXT: v_writelane_b32 v1, s77, 38 -; FLATSCR-NEXT: v_writelane_b32 v1, s78, 39 -; FLATSCR-NEXT: v_writelane_b32 v1, s79, 40 -; FLATSCR-NEXT: v_writelane_b32 v1, s80, 41 -; FLATSCR-NEXT: v_writelane_b32 v1, s81, 42 -; FLATSCR-NEXT: v_writelane_b32 v1, s82, 43 -; FLATSCR-NEXT: v_writelane_b32 v1, s83, 44 -; FLATSCR-NEXT: v_writelane_b32 v1, s84, 45 -; FLATSCR-NEXT: v_writelane_b32 v1, s85, 46 -; FLATSCR-NEXT: v_writelane_b32 v1, s86, 47 -; FLATSCR-NEXT: v_writelane_b32 v1, s87, 48 -; FLATSCR-NEXT: v_writelane_b32 v1, s88, 49 -; FLATSCR-NEXT: v_writelane_b32 v1, s89, 50 -; FLATSCR-NEXT: v_writelane_b32 v1, s90, 51 -; FLATSCR-NEXT: v_writelane_b32 v1, s91, 52 -; FLATSCR-NEXT: v_writelane_b32 v1, s92, 53 -; FLATSCR-NEXT: v_writelane_b32 v1, s93, 54 -; FLATSCR-NEXT: v_writelane_b32 v1, s94, 55 -; FLATSCR-NEXT: v_writelane_b32 v1, s95, 56 -; FLATSCR-NEXT: v_writelane_b32 v1, s96, 57 -; FLATSCR-NEXT: v_writelane_b32 v1, s97, 58 -; FLATSCR-NEXT: v_writelane_b32 v1, s98, 59 -; FLATSCR-NEXT: v_writelane_b32 v1, s99, 60 -; FLATSCR-NEXT: v_writelane_b32 v1, s100, 61 -; FLATSCR-NEXT: v_writelane_b32 v1, s101, 62 -; FLATSCR-NEXT: v_writelane_b32 v1, s102, 63 +; 
FLATSCR-NEXT: v_writelane_b32 v1, s48, 1 +; FLATSCR-NEXT: v_writelane_b32 v1, s49, 2 +; FLATSCR-NEXT: v_writelane_b32 v1, s50, 3 +; FLATSCR-NEXT: v_writelane_b32 v1, s51, 4 +; FLATSCR-NEXT: v_writelane_b32 v1, s52, 5 +; FLATSCR-NEXT: v_writelane_b32 v1, s53, 6 +; FLATSCR-NEXT: v_writelane_b32 v1, s54, 7 +; FLATSCR-NEXT: v_writelane_b32 v1, s55, 8 +; FLATSCR-NEXT: v_writelane_b32 v1, s64, 9 +; FLATSCR-NEXT: v_writelane_b32 v1, s65, 10 +; FLATSCR-NEXT: v_writelane_b32 v1, s66, 11 +; FLATSCR-NEXT: v_writelane_b32 v1, s67, 12 +; FLATSCR-NEXT: v_writelane_b32 v1, s68, 13 +; FLATSCR-NEXT: v_writelane_b32 v1, s69, 14 +; FLATSCR-NEXT: v_writelane_b32 v1, s70, 15 +; FLATSCR-NEXT: v_writelane_b32 v1, s71, 16 +; FLATSCR-NEXT: v_writelane_b32 v1, s80, 17 +; FLATSCR-NEXT: v_writelane_b32 v1, s81, 18 +; FLATSCR-NEXT: v_writelane_b32 v1, s82, 19 +; FLATSCR-NEXT: v_writelane_b32 v1, s83, 20 +; FLATSCR-NEXT: v_writelane_b32 v1, s84, 21 +; FLATSCR-NEXT: v_writelane_b32 v1, s85, 22 +; FLATSCR-NEXT: v_writelane_b32 v1, s86, 23 +; FLATSCR-NEXT: v_writelane_b32 v1, s87, 24 +; FLATSCR-NEXT: v_writelane_b32 v1, s96, 25 +; FLATSCR-NEXT: v_writelane_b32 v1, s97, 26 +; FLATSCR-NEXT: v_writelane_b32 v1, s98, 27 +; FLATSCR-NEXT: v_writelane_b32 v1, s99, 28 +; FLATSCR-NEXT: v_writelane_b32 v1, s100, 29 +; FLATSCR-NEXT: v_writelane_b32 v1, s101, 30 +; FLATSCR-NEXT: v_writelane_b32 v1, s102, 31 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -1215,69 +865,37 @@ define void @no_new_vgpr_for_fp_csr() #1 { ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload -; FLATSCR-NEXT: v_readlane_b32 s102, v1, 63 -; FLATSCR-NEXT: v_readlane_b32 s101, v1, 62 -; FLATSCR-NEXT: v_readlane_b32 s100, v1, 61 -; FLATSCR-NEXT: v_readlane_b32 s99, v1, 60 -; FLATSCR-NEXT: v_readlane_b32 s98, v1, 59 -; FLATSCR-NEXT: v_readlane_b32 s97, v1, 58 -; FLATSCR-NEXT: 
v_readlane_b32 s96, v1, 57 -; FLATSCR-NEXT: v_readlane_b32 s95, v1, 56 -; FLATSCR-NEXT: v_readlane_b32 s94, v1, 55 -; FLATSCR-NEXT: v_readlane_b32 s93, v1, 54 -; FLATSCR-NEXT: v_readlane_b32 s92, v1, 53 -; FLATSCR-NEXT: v_readlane_b32 s91, v1, 52 -; FLATSCR-NEXT: v_readlane_b32 s90, v1, 51 -; FLATSCR-NEXT: v_readlane_b32 s89, v1, 50 -; FLATSCR-NEXT: v_readlane_b32 s88, v1, 49 -; FLATSCR-NEXT: v_readlane_b32 s87, v1, 48 -; FLATSCR-NEXT: v_readlane_b32 s86, v1, 47 -; FLATSCR-NEXT: v_readlane_b32 s85, v1, 46 -; FLATSCR-NEXT: v_readlane_b32 s84, v1, 45 -; FLATSCR-NEXT: v_readlane_b32 s83, v1, 44 -; FLATSCR-NEXT: v_readlane_b32 s82, v1, 43 -; FLATSCR-NEXT: v_readlane_b32 s81, v1, 42 -; FLATSCR-NEXT: v_readlane_b32 s80, v1, 41 -; FLATSCR-NEXT: v_readlane_b32 s79, v1, 40 -; FLATSCR-NEXT: v_readlane_b32 s78, v1, 39 -; FLATSCR-NEXT: v_readlane_b32 s77, v1, 38 -; FLATSCR-NEXT: v_readlane_b32 s76, v1, 37 -; FLATSCR-NEXT: v_readlane_b32 s75, v1, 36 -; FLATSCR-NEXT: v_readlane_b32 s74, v1, 35 -; FLATSCR-NEXT: v_readlane_b32 s73, v1, 34 -; FLATSCR-NEXT: v_readlane_b32 s72, v1, 33 -; FLATSCR-NEXT: v_readlane_b32 s71, v1, 32 -; FLATSCR-NEXT: v_readlane_b32 s70, v1, 31 -; FLATSCR-NEXT: v_readlane_b32 s69, v1, 30 -; FLATSCR-NEXT: v_readlane_b32 s68, v1, 29 -; FLATSCR-NEXT: v_readlane_b32 s67, v1, 28 -; FLATSCR-NEXT: v_readlane_b32 s66, v1, 27 -; FLATSCR-NEXT: v_readlane_b32 s65, v1, 26 -; FLATSCR-NEXT: v_readlane_b32 s64, v1, 25 -; FLATSCR-NEXT: v_readlane_b32 s63, v1, 24 -; FLATSCR-NEXT: v_readlane_b32 s62, v1, 23 -; FLATSCR-NEXT: v_readlane_b32 s61, v1, 22 -; FLATSCR-NEXT: v_readlane_b32 s60, v1, 21 -; FLATSCR-NEXT: v_readlane_b32 s59, v1, 20 -; FLATSCR-NEXT: v_readlane_b32 s58, v1, 19 -; FLATSCR-NEXT: v_readlane_b32 s57, v1, 18 -; FLATSCR-NEXT: v_readlane_b32 s56, v1, 17 -; FLATSCR-NEXT: v_readlane_b32 s55, v1, 16 -; FLATSCR-NEXT: v_readlane_b32 s54, v1, 15 -; FLATSCR-NEXT: v_readlane_b32 s53, v1, 14 -; FLATSCR-NEXT: v_readlane_b32 s52, v1, 13 -; FLATSCR-NEXT: v_readlane_b32 s51, 
v1, 12 -; FLATSCR-NEXT: v_readlane_b32 s50, v1, 11 -; FLATSCR-NEXT: v_readlane_b32 s49, v1, 10 -; FLATSCR-NEXT: v_readlane_b32 s48, v1, 9 -; FLATSCR-NEXT: v_readlane_b32 s47, v1, 8 -; FLATSCR-NEXT: v_readlane_b32 s46, v1, 7 -; FLATSCR-NEXT: v_readlane_b32 s45, v1, 6 -; FLATSCR-NEXT: v_readlane_b32 s44, v1, 5 -; FLATSCR-NEXT: v_readlane_b32 s43, v1, 4 -; FLATSCR-NEXT: v_readlane_b32 s42, v1, 3 -; FLATSCR-NEXT: v_readlane_b32 s41, v1, 2 -; FLATSCR-NEXT: v_readlane_b32 s40, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s102, v1, 31 +; FLATSCR-NEXT: v_readlane_b32 s101, v1, 30 +; FLATSCR-NEXT: v_readlane_b32 s100, v1, 29 +; FLATSCR-NEXT: v_readlane_b32 s99, v1, 28 +; FLATSCR-NEXT: v_readlane_b32 s98, v1, 27 +; FLATSCR-NEXT: v_readlane_b32 s97, v1, 26 +; FLATSCR-NEXT: v_readlane_b32 s96, v1, 25 +; FLATSCR-NEXT: v_readlane_b32 s87, v1, 24 +; FLATSCR-NEXT: v_readlane_b32 s86, v1, 23 +; FLATSCR-NEXT: v_readlane_b32 s85, v1, 22 +; FLATSCR-NEXT: v_readlane_b32 s84, v1, 21 +; FLATSCR-NEXT: v_readlane_b32 s83, v1, 20 +; FLATSCR-NEXT: v_readlane_b32 s82, v1, 19 +; FLATSCR-NEXT: v_readlane_b32 s81, v1, 18 +; FLATSCR-NEXT: v_readlane_b32 s80, v1, 17 +; FLATSCR-NEXT: v_readlane_b32 s71, v1, 16 +; FLATSCR-NEXT: v_readlane_b32 s70, v1, 15 +; FLATSCR-NEXT: v_readlane_b32 s69, v1, 14 +; FLATSCR-NEXT: v_readlane_b32 s68, v1, 13 +; FLATSCR-NEXT: v_readlane_b32 s67, v1, 12 +; FLATSCR-NEXT: v_readlane_b32 s66, v1, 11 +; FLATSCR-NEXT: v_readlane_b32 s65, v1, 10 +; FLATSCR-NEXT: v_readlane_b32 s64, v1, 9 +; FLATSCR-NEXT: v_readlane_b32 s55, v1, 8 +; FLATSCR-NEXT: v_readlane_b32 s54, v1, 7 +; FLATSCR-NEXT: v_readlane_b32 s53, v1, 6 +; FLATSCR-NEXT: v_readlane_b32 s52, v1, 5 +; FLATSCR-NEXT: v_readlane_b32 s51, v1, 4 +; FLATSCR-NEXT: v_readlane_b32 s50, v1, 3 +; FLATSCR-NEXT: v_readlane_b32 s49, v1, 2 +; FLATSCR-NEXT: v_readlane_b32 s48, v1, 1 ; FLATSCR-NEXT: v_readlane_b32 s39, v1, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 @@ -1346,7 +964,7 @@ define void 
@no_unused_non_csr_sgpr_for_fp() #1 { ; MUBUF-LABEL: no_unused_non_csr_sgpr_for_fp: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_mov_b32 vcc_lo, s33 +; MUBUF-NEXT: s_mov_b32 s40, s33 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -1365,14 +983,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: s_mov_b32 s33, vcc_lo +; MUBUF-NEXT: s_mov_b32 s33, s40 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: no_unused_non_csr_sgpr_for_fp: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 vcc_lo, s33 +; FLATSCR-NEXT: s_mov_b32 s40, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:4 ; 4-byte Folded Spill @@ -1391,7 +1009,7 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: s_mov_b32 s33, vcc_lo +; FLATSCR-NEXT: s_mov_b32 s33, s40 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) @@ -1412,7 +1030,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; MUBUF-LABEL: no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_mov_b32 vcc_lo, s33 +; MUBUF-NEXT: s_mov_b32 s40, s33 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded 
Spill @@ -1434,14 +1052,14 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: s_mov_b32 s33, vcc_lo +; MUBUF-NEXT: s_mov_b32 s33, s40 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 vcc_lo, s33 +; FLATSCR-NEXT: s_mov_b32 s40, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill @@ -1463,7 +1081,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: s_mov_b32 s33, vcc_lo +; FLATSCR-NEXT: s_mov_b32 s33, s40 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) @@ -1491,7 +1109,7 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) ; MUBUF-LABEL: scratch_reg_needed_mubuf_offset: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_mov_b32 vcc_lo, s33 +; MUBUF-NEXT: s_mov_b32 s40, s33 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: s_add_i32 s6, s33, 0x40100 @@ -1517,14 +1135,14 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) ; MUBUF-NEXT: s_add_i32 s6, s33, 0x40100 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s6 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] -; MUBUF-NEXT: s_mov_b32 s33, vcc_lo +; MUBUF-NEXT: s_mov_b32 s33, s40 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; 
MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: scratch_reg_needed_mubuf_offset: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 vcc_lo, s33 +; FLATSCR-NEXT: s_mov_b32 s40, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: s_add_i32 s2, s33, 0x1004 @@ -1550,7 +1168,7 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) ; FLATSCR-NEXT: s_add_i32 s2, s33, 0x1004 ; FLATSCR-NEXT: scratch_load_dword v40, off, s2 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: s_mov_b32 s33, vcc_lo +; FLATSCR-NEXT: s_mov_b32 s33, s40 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) @@ -1650,22 +1268,15 @@ define void @callee_need_to_spill_fp_to_memory() #3 { ; MUBUF-LABEL: callee_need_to_spill_fp_to_memory: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s40, s33 ; MUBUF-NEXT: s_mov_b32 s33, s32 -; MUBUF-NEXT: v_mov_b32_e32 v0, s4 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber nonpreserved SGPRs ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber all VGPRs ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload -; MUBUF-NEXT: s_addk_i32 s32, 0x200 -; MUBUF-NEXT: s_mov_b32 s32, s33 -; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_readfirstlane_b32 s4, v0 -; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_mov_b32 s33, s40 ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: callee_need_to_spill_fp_to_memory: @@ -1707,147 +1318,80 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { ; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], s33 ; 4-byte Folded Spill ; MUBUF-NEXT: 
s_mov_b64 exec, s[6:7] -; MUBUF-NEXT: v_mov_b32_e32 v0, s4 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: v_writelane_b32 v39, s4, 32 +; MUBUF-NEXT: s_addk_i32 s32, 0x200 ; MUBUF-NEXT: v_writelane_b32 v39, s39, 0 -; MUBUF-NEXT: v_writelane_b32 v39, s40, 1 -; MUBUF-NEXT: v_writelane_b32 v39, s41, 2 -; MUBUF-NEXT: v_writelane_b32 v39, s42, 3 -; MUBUF-NEXT: v_writelane_b32 v39, s43, 4 -; MUBUF-NEXT: v_writelane_b32 v39, s44, 5 -; MUBUF-NEXT: v_writelane_b32 v39, s45, 6 -; MUBUF-NEXT: v_writelane_b32 v39, s46, 7 -; MUBUF-NEXT: v_writelane_b32 v39, s47, 8 -; MUBUF-NEXT: v_writelane_b32 v39, s48, 9 -; MUBUF-NEXT: v_writelane_b32 v39, s49, 10 -; MUBUF-NEXT: v_writelane_b32 v39, s50, 11 -; MUBUF-NEXT: v_writelane_b32 v39, s51, 12 -; MUBUF-NEXT: v_writelane_b32 v39, s52, 13 -; MUBUF-NEXT: v_writelane_b32 v39, s53, 14 -; MUBUF-NEXT: v_writelane_b32 v39, s54, 15 -; MUBUF-NEXT: v_writelane_b32 v39, s55, 16 -; MUBUF-NEXT: v_writelane_b32 v39, s56, 17 -; MUBUF-NEXT: v_writelane_b32 v39, s57, 18 -; MUBUF-NEXT: v_writelane_b32 v39, s58, 19 -; MUBUF-NEXT: v_writelane_b32 v39, s59, 20 -; MUBUF-NEXT: v_writelane_b32 v39, s60, 21 -; MUBUF-NEXT: v_writelane_b32 v39, s61, 22 -; MUBUF-NEXT: v_writelane_b32 v39, s62, 23 -; MUBUF-NEXT: v_writelane_b32 v39, s63, 24 -; MUBUF-NEXT: v_writelane_b32 v39, s64, 25 -; MUBUF-NEXT: v_writelane_b32 v39, s65, 26 -; MUBUF-NEXT: v_writelane_b32 v39, s66, 27 -; MUBUF-NEXT: v_writelane_b32 v39, s67, 28 -; MUBUF-NEXT: v_writelane_b32 v39, s68, 29 -; MUBUF-NEXT: v_writelane_b32 v39, s69, 30 -; MUBUF-NEXT: v_writelane_b32 v39, s70, 31 -; MUBUF-NEXT: v_writelane_b32 v39, s71, 32 -; MUBUF-NEXT: v_writelane_b32 v39, s72, 33 -; MUBUF-NEXT: v_writelane_b32 v39, s73, 34 -; MUBUF-NEXT: v_writelane_b32 v39, s74, 35 -; MUBUF-NEXT: v_writelane_b32 v39, s75, 36 -; MUBUF-NEXT: v_writelane_b32 v39, s76, 37 -; MUBUF-NEXT: v_writelane_b32 v39, s77, 38 -; MUBUF-NEXT: v_writelane_b32 
v39, s78, 39 -; MUBUF-NEXT: v_writelane_b32 v39, s79, 40 -; MUBUF-NEXT: v_writelane_b32 v39, s80, 41 -; MUBUF-NEXT: v_writelane_b32 v39, s81, 42 -; MUBUF-NEXT: v_writelane_b32 v39, s82, 43 -; MUBUF-NEXT: v_writelane_b32 v39, s83, 44 -; MUBUF-NEXT: v_writelane_b32 v39, s84, 45 -; MUBUF-NEXT: v_writelane_b32 v39, s85, 46 -; MUBUF-NEXT: v_writelane_b32 v39, s86, 47 -; MUBUF-NEXT: v_writelane_b32 v39, s87, 48 -; MUBUF-NEXT: v_writelane_b32 v39, s88, 49 -; MUBUF-NEXT: v_writelane_b32 v39, s89, 50 -; MUBUF-NEXT: v_writelane_b32 v39, s90, 51 -; MUBUF-NEXT: v_writelane_b32 v39, s91, 52 -; MUBUF-NEXT: v_writelane_b32 v39, s92, 53 -; MUBUF-NEXT: v_writelane_b32 v39, s93, 54 -; MUBUF-NEXT: v_writelane_b32 v39, s94, 55 -; MUBUF-NEXT: v_writelane_b32 v39, s95, 56 -; MUBUF-NEXT: v_writelane_b32 v39, s96, 57 -; MUBUF-NEXT: v_writelane_b32 v39, s97, 58 -; MUBUF-NEXT: v_writelane_b32 v39, s98, 59 -; MUBUF-NEXT: v_writelane_b32 v39, s99, 60 -; MUBUF-NEXT: v_writelane_b32 v39, s100, 61 -; MUBUF-NEXT: v_writelane_b32 v39, s101, 62 -; MUBUF-NEXT: v_writelane_b32 v39, s102, 63 +; MUBUF-NEXT: v_writelane_b32 v39, s48, 1 +; MUBUF-NEXT: v_writelane_b32 v39, s49, 2 +; MUBUF-NEXT: v_writelane_b32 v39, s50, 3 +; MUBUF-NEXT: v_writelane_b32 v39, s51, 4 +; MUBUF-NEXT: v_writelane_b32 v39, s52, 5 +; MUBUF-NEXT: v_writelane_b32 v39, s53, 6 +; MUBUF-NEXT: v_writelane_b32 v39, s54, 7 +; MUBUF-NEXT: v_writelane_b32 v39, s55, 8 +; MUBUF-NEXT: v_writelane_b32 v39, s64, 9 +; MUBUF-NEXT: v_writelane_b32 v39, s65, 10 +; MUBUF-NEXT: v_writelane_b32 v39, s66, 11 +; MUBUF-NEXT: v_writelane_b32 v39, s67, 12 +; MUBUF-NEXT: v_writelane_b32 v39, s68, 13 +; MUBUF-NEXT: v_writelane_b32 v39, s69, 14 +; MUBUF-NEXT: v_writelane_b32 v39, s70, 15 +; MUBUF-NEXT: v_writelane_b32 v39, s71, 16 +; MUBUF-NEXT: v_writelane_b32 v39, s80, 17 +; MUBUF-NEXT: v_writelane_b32 v39, s81, 18 +; MUBUF-NEXT: v_writelane_b32 v39, s82, 19 +; MUBUF-NEXT: v_writelane_b32 v39, s83, 20 +; MUBUF-NEXT: v_writelane_b32 v39, s84, 21 +; 
MUBUF-NEXT: v_writelane_b32 v39, s85, 22 +; MUBUF-NEXT: v_writelane_b32 v39, s86, 23 +; MUBUF-NEXT: v_writelane_b32 v39, s87, 24 +; MUBUF-NEXT: v_writelane_b32 v39, s96, 25 +; MUBUF-NEXT: v_writelane_b32 v39, s97, 26 +; MUBUF-NEXT: v_writelane_b32 v39, s98, 27 +; MUBUF-NEXT: v_writelane_b32 v39, s99, 28 +; MUBUF-NEXT: v_writelane_b32 v39, s100, 29 +; MUBUF-NEXT: v_writelane_b32 v39, s101, 30 +; MUBUF-NEXT: v_writelane_b32 v39, s102, 31 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber all VGPRs except CSR v40 ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; MUBUF-NEXT: v_readlane_b32 s102, v39, 63 -; MUBUF-NEXT: v_readlane_b32 s101, v39, 62 -; MUBUF-NEXT: v_readlane_b32 s100, v39, 61 -; MUBUF-NEXT: v_readlane_b32 s99, v39, 60 -; MUBUF-NEXT: v_readlane_b32 s98, v39, 59 -; MUBUF-NEXT: v_readlane_b32 s97, v39, 58 -; MUBUF-NEXT: v_readlane_b32 s96, v39, 57 -; MUBUF-NEXT: v_readlane_b32 s95, v39, 56 -; MUBUF-NEXT: v_readlane_b32 s94, v39, 55 -; MUBUF-NEXT: v_readlane_b32 s93, v39, 54 -; MUBUF-NEXT: v_readlane_b32 s92, v39, 53 -; MUBUF-NEXT: v_readlane_b32 s91, v39, 52 -; MUBUF-NEXT: v_readlane_b32 s90, v39, 51 -; MUBUF-NEXT: v_readlane_b32 s89, v39, 50 -; MUBUF-NEXT: v_readlane_b32 s88, v39, 49 -; MUBUF-NEXT: v_readlane_b32 s87, v39, 48 -; MUBUF-NEXT: v_readlane_b32 s86, v39, 47 -; MUBUF-NEXT: v_readlane_b32 s85, v39, 46 -; MUBUF-NEXT: v_readlane_b32 s84, v39, 45 -; MUBUF-NEXT: v_readlane_b32 s83, v39, 44 -; MUBUF-NEXT: v_readlane_b32 s82, v39, 43 -; MUBUF-NEXT: v_readlane_b32 s81, v39, 42 -; MUBUF-NEXT: v_readlane_b32 s80, v39, 41 -; MUBUF-NEXT: v_readlane_b32 s79, v39, 40 -; MUBUF-NEXT: v_readlane_b32 s78, v39, 39 -; MUBUF-NEXT: v_readlane_b32 s77, v39, 38 -; MUBUF-NEXT: v_readlane_b32 s76, v39, 37 -; MUBUF-NEXT: v_readlane_b32 s75, v39, 36 -; MUBUF-NEXT: v_readlane_b32 s74, v39, 35 -; MUBUF-NEXT: 
v_readlane_b32 s73, v39, 34 -; MUBUF-NEXT: v_readlane_b32 s72, v39, 33 -; MUBUF-NEXT: v_readlane_b32 s71, v39, 32 -; MUBUF-NEXT: v_readlane_b32 s70, v39, 31 -; MUBUF-NEXT: v_readlane_b32 s69, v39, 30 -; MUBUF-NEXT: v_readlane_b32 s68, v39, 29 -; MUBUF-NEXT: v_readlane_b32 s67, v39, 28 -; MUBUF-NEXT: v_readlane_b32 s66, v39, 27 -; MUBUF-NEXT: v_readlane_b32 s65, v39, 26 -; MUBUF-NEXT: v_readlane_b32 s64, v39, 25 -; MUBUF-NEXT: v_readlane_b32 s63, v39, 24 -; MUBUF-NEXT: v_readlane_b32 s62, v39, 23 -; MUBUF-NEXT: v_readlane_b32 s61, v39, 22 -; MUBUF-NEXT: v_readlane_b32 s60, v39, 21 -; MUBUF-NEXT: v_readlane_b32 s59, v39, 20 -; MUBUF-NEXT: v_readlane_b32 s58, v39, 19 -; MUBUF-NEXT: v_readlane_b32 s57, v39, 18 -; MUBUF-NEXT: v_readlane_b32 s56, v39, 17 -; MUBUF-NEXT: v_readlane_b32 s55, v39, 16 -; MUBUF-NEXT: v_readlane_b32 s54, v39, 15 -; MUBUF-NEXT: v_readlane_b32 s53, v39, 14 -; MUBUF-NEXT: v_readlane_b32 s52, v39, 13 -; MUBUF-NEXT: v_readlane_b32 s51, v39, 12 -; MUBUF-NEXT: v_readlane_b32 s50, v39, 11 -; MUBUF-NEXT: v_readlane_b32 s49, v39, 10 -; MUBUF-NEXT: v_readlane_b32 s48, v39, 9 -; MUBUF-NEXT: v_readlane_b32 s47, v39, 8 -; MUBUF-NEXT: v_readlane_b32 s46, v39, 7 -; MUBUF-NEXT: v_readlane_b32 s45, v39, 6 -; MUBUF-NEXT: v_readlane_b32 s44, v39, 5 -; MUBUF-NEXT: v_readlane_b32 s43, v39, 4 -; MUBUF-NEXT: v_readlane_b32 s42, v39, 3 -; MUBUF-NEXT: v_readlane_b32 s41, v39, 2 -; MUBUF-NEXT: v_readlane_b32 s40, v39, 1 +; MUBUF-NEXT: v_readlane_b32 s102, v39, 31 +; MUBUF-NEXT: v_readlane_b32 s101, v39, 30 +; MUBUF-NEXT: v_readlane_b32 s100, v39, 29 +; MUBUF-NEXT: v_readlane_b32 s99, v39, 28 +; MUBUF-NEXT: v_readlane_b32 s98, v39, 27 +; MUBUF-NEXT: v_readlane_b32 s97, v39, 26 +; MUBUF-NEXT: v_readlane_b32 s96, v39, 25 +; MUBUF-NEXT: v_readlane_b32 s87, v39, 24 +; MUBUF-NEXT: v_readlane_b32 s86, v39, 23 +; MUBUF-NEXT: v_readlane_b32 s85, v39, 22 +; MUBUF-NEXT: v_readlane_b32 s84, v39, 21 +; MUBUF-NEXT: v_readlane_b32 s83, v39, 20 +; MUBUF-NEXT: v_readlane_b32 s82, v39, 19 
+; MUBUF-NEXT: v_readlane_b32 s81, v39, 18 +; MUBUF-NEXT: v_readlane_b32 s80, v39, 17 +; MUBUF-NEXT: v_readlane_b32 s71, v39, 16 +; MUBUF-NEXT: v_readlane_b32 s70, v39, 15 +; MUBUF-NEXT: v_readlane_b32 s69, v39, 14 +; MUBUF-NEXT: v_readlane_b32 s68, v39, 13 +; MUBUF-NEXT: v_readlane_b32 s67, v39, 12 +; MUBUF-NEXT: v_readlane_b32 s66, v39, 11 +; MUBUF-NEXT: v_readlane_b32 s65, v39, 10 +; MUBUF-NEXT: v_readlane_b32 s64, v39, 9 +; MUBUF-NEXT: v_readlane_b32 s55, v39, 8 +; MUBUF-NEXT: v_readlane_b32 s54, v39, 7 +; MUBUF-NEXT: v_readlane_b32 s53, v39, 6 +; MUBUF-NEXT: v_readlane_b32 s52, v39, 5 +; MUBUF-NEXT: v_readlane_b32 s51, v39, 4 +; MUBUF-NEXT: v_readlane_b32 s50, v39, 3 +; MUBUF-NEXT: v_readlane_b32 s49, v39, 2 +; MUBUF-NEXT: v_readlane_b32 s48, v39, 1 ; MUBUF-NEXT: v_readlane_b32 s39, v39, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 -; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_readfirstlane_b32 s4, v0 +; MUBUF-NEXT: v_readlane_b32 s4, v39, 32 ; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v39, off, s[0:3], s33 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] @@ -1865,138 +1409,74 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, 8 ; FLATSCR-NEXT: v_writelane_b32 v39, s39, 0 -; FLATSCR-NEXT: v_writelane_b32 v39, s40, 1 -; FLATSCR-NEXT: v_writelane_b32 v39, s41, 2 -; FLATSCR-NEXT: v_writelane_b32 v39, s42, 3 -; FLATSCR-NEXT: v_writelane_b32 v39, s43, 4 -; FLATSCR-NEXT: v_writelane_b32 v39, s44, 5 -; FLATSCR-NEXT: v_writelane_b32 v39, s45, 6 -; FLATSCR-NEXT: v_writelane_b32 v39, s46, 7 -; FLATSCR-NEXT: v_writelane_b32 v39, s47, 8 -; FLATSCR-NEXT: v_writelane_b32 v39, s48, 9 -; FLATSCR-NEXT: v_writelane_b32 v39, s49, 10 -; FLATSCR-NEXT: v_writelane_b32 v39, s50, 11 -; FLATSCR-NEXT: v_writelane_b32 v39, s51, 12 -; FLATSCR-NEXT: v_writelane_b32 v39, s52, 13 -; FLATSCR-NEXT: v_writelane_b32 v39, s53, 14 -; FLATSCR-NEXT: 
v_writelane_b32 v39, s54, 15 -; FLATSCR-NEXT: v_writelane_b32 v39, s55, 16 -; FLATSCR-NEXT: v_writelane_b32 v39, s56, 17 -; FLATSCR-NEXT: v_writelane_b32 v39, s57, 18 -; FLATSCR-NEXT: v_writelane_b32 v39, s58, 19 -; FLATSCR-NEXT: v_writelane_b32 v39, s59, 20 -; FLATSCR-NEXT: v_writelane_b32 v39, s60, 21 -; FLATSCR-NEXT: v_writelane_b32 v39, s61, 22 -; FLATSCR-NEXT: v_writelane_b32 v39, s62, 23 -; FLATSCR-NEXT: v_writelane_b32 v39, s63, 24 -; FLATSCR-NEXT: v_writelane_b32 v39, s64, 25 -; FLATSCR-NEXT: v_writelane_b32 v39, s65, 26 -; FLATSCR-NEXT: v_writelane_b32 v39, s66, 27 -; FLATSCR-NEXT: v_writelane_b32 v39, s67, 28 -; FLATSCR-NEXT: v_writelane_b32 v39, s68, 29 -; FLATSCR-NEXT: v_writelane_b32 v39, s69, 30 -; FLATSCR-NEXT: v_writelane_b32 v39, s70, 31 -; FLATSCR-NEXT: v_writelane_b32 v39, s71, 32 -; FLATSCR-NEXT: v_writelane_b32 v39, s72, 33 -; FLATSCR-NEXT: v_writelane_b32 v39, s73, 34 -; FLATSCR-NEXT: v_writelane_b32 v39, s74, 35 -; FLATSCR-NEXT: v_writelane_b32 v39, s75, 36 -; FLATSCR-NEXT: v_writelane_b32 v39, s76, 37 -; FLATSCR-NEXT: v_writelane_b32 v39, s77, 38 -; FLATSCR-NEXT: v_writelane_b32 v39, s78, 39 -; FLATSCR-NEXT: v_writelane_b32 v39, s79, 40 -; FLATSCR-NEXT: v_writelane_b32 v39, s80, 41 -; FLATSCR-NEXT: v_writelane_b32 v39, s81, 42 -; FLATSCR-NEXT: v_writelane_b32 v39, s82, 43 -; FLATSCR-NEXT: v_writelane_b32 v39, s83, 44 -; FLATSCR-NEXT: v_writelane_b32 v39, s84, 45 -; FLATSCR-NEXT: v_writelane_b32 v39, s85, 46 -; FLATSCR-NEXT: v_writelane_b32 v39, s86, 47 -; FLATSCR-NEXT: v_writelane_b32 v39, s87, 48 -; FLATSCR-NEXT: v_writelane_b32 v39, s88, 49 -; FLATSCR-NEXT: v_writelane_b32 v39, s89, 50 -; FLATSCR-NEXT: v_writelane_b32 v39, s90, 51 -; FLATSCR-NEXT: v_writelane_b32 v39, s91, 52 -; FLATSCR-NEXT: v_writelane_b32 v39, s92, 53 -; FLATSCR-NEXT: v_writelane_b32 v39, s93, 54 -; FLATSCR-NEXT: v_writelane_b32 v39, s94, 55 -; FLATSCR-NEXT: v_writelane_b32 v39, s95, 56 -; FLATSCR-NEXT: v_writelane_b32 v39, s96, 57 -; FLATSCR-NEXT: v_writelane_b32 v39, 
s97, 58 -; FLATSCR-NEXT: v_writelane_b32 v39, s98, 59 -; FLATSCR-NEXT: v_writelane_b32 v39, s99, 60 -; FLATSCR-NEXT: v_writelane_b32 v39, s100, 61 -; FLATSCR-NEXT: v_writelane_b32 v39, s101, 62 -; FLATSCR-NEXT: v_writelane_b32 v39, s102, 63 +; FLATSCR-NEXT: v_writelane_b32 v39, s48, 1 +; FLATSCR-NEXT: v_writelane_b32 v39, s49, 2 +; FLATSCR-NEXT: v_writelane_b32 v39, s50, 3 +; FLATSCR-NEXT: v_writelane_b32 v39, s51, 4 +; FLATSCR-NEXT: v_writelane_b32 v39, s52, 5 +; FLATSCR-NEXT: v_writelane_b32 v39, s53, 6 +; FLATSCR-NEXT: v_writelane_b32 v39, s54, 7 +; FLATSCR-NEXT: v_writelane_b32 v39, s55, 8 +; FLATSCR-NEXT: v_writelane_b32 v39, s64, 9 +; FLATSCR-NEXT: v_writelane_b32 v39, s65, 10 +; FLATSCR-NEXT: v_writelane_b32 v39, s66, 11 +; FLATSCR-NEXT: v_writelane_b32 v39, s67, 12 +; FLATSCR-NEXT: v_writelane_b32 v39, s68, 13 +; FLATSCR-NEXT: v_writelane_b32 v39, s69, 14 +; FLATSCR-NEXT: v_writelane_b32 v39, s70, 15 +; FLATSCR-NEXT: v_writelane_b32 v39, s71, 16 +; FLATSCR-NEXT: v_writelane_b32 v39, s80, 17 +; FLATSCR-NEXT: v_writelane_b32 v39, s81, 18 +; FLATSCR-NEXT: v_writelane_b32 v39, s82, 19 +; FLATSCR-NEXT: v_writelane_b32 v39, s83, 20 +; FLATSCR-NEXT: v_writelane_b32 v39, s84, 21 +; FLATSCR-NEXT: v_writelane_b32 v39, s85, 22 +; FLATSCR-NEXT: v_writelane_b32 v39, s86, 23 +; FLATSCR-NEXT: v_writelane_b32 v39, s87, 24 +; FLATSCR-NEXT: v_writelane_b32 v39, s96, 25 +; FLATSCR-NEXT: v_writelane_b32 v39, s97, 26 +; FLATSCR-NEXT: v_writelane_b32 v39, s98, 27 +; FLATSCR-NEXT: v_writelane_b32 v39, s99, 28 +; FLATSCR-NEXT: v_writelane_b32 v39, s100, 29 +; FLATSCR-NEXT: v_writelane_b32 v39, s101, 30 +; FLATSCR-NEXT: v_writelane_b32 v39, s102, 31 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber all VGPRs except CSR v40 ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s102, v39, 63 -; FLATSCR-NEXT: v_readlane_b32 s101, v39, 62 -; FLATSCR-NEXT: 
v_readlane_b32 s100, v39, 61 -; FLATSCR-NEXT: v_readlane_b32 s99, v39, 60 -; FLATSCR-NEXT: v_readlane_b32 s98, v39, 59 -; FLATSCR-NEXT: v_readlane_b32 s97, v39, 58 -; FLATSCR-NEXT: v_readlane_b32 s96, v39, 57 -; FLATSCR-NEXT: v_readlane_b32 s95, v39, 56 -; FLATSCR-NEXT: v_readlane_b32 s94, v39, 55 -; FLATSCR-NEXT: v_readlane_b32 s93, v39, 54 -; FLATSCR-NEXT: v_readlane_b32 s92, v39, 53 -; FLATSCR-NEXT: v_readlane_b32 s91, v39, 52 -; FLATSCR-NEXT: v_readlane_b32 s90, v39, 51 -; FLATSCR-NEXT: v_readlane_b32 s89, v39, 50 -; FLATSCR-NEXT: v_readlane_b32 s88, v39, 49 -; FLATSCR-NEXT: v_readlane_b32 s87, v39, 48 -; FLATSCR-NEXT: v_readlane_b32 s86, v39, 47 -; FLATSCR-NEXT: v_readlane_b32 s85, v39, 46 -; FLATSCR-NEXT: v_readlane_b32 s84, v39, 45 -; FLATSCR-NEXT: v_readlane_b32 s83, v39, 44 -; FLATSCR-NEXT: v_readlane_b32 s82, v39, 43 -; FLATSCR-NEXT: v_readlane_b32 s81, v39, 42 -; FLATSCR-NEXT: v_readlane_b32 s80, v39, 41 -; FLATSCR-NEXT: v_readlane_b32 s79, v39, 40 -; FLATSCR-NEXT: v_readlane_b32 s78, v39, 39 -; FLATSCR-NEXT: v_readlane_b32 s77, v39, 38 -; FLATSCR-NEXT: v_readlane_b32 s76, v39, 37 -; FLATSCR-NEXT: v_readlane_b32 s75, v39, 36 -; FLATSCR-NEXT: v_readlane_b32 s74, v39, 35 -; FLATSCR-NEXT: v_readlane_b32 s73, v39, 34 -; FLATSCR-NEXT: v_readlane_b32 s72, v39, 33 -; FLATSCR-NEXT: v_readlane_b32 s71, v39, 32 -; FLATSCR-NEXT: v_readlane_b32 s70, v39, 31 -; FLATSCR-NEXT: v_readlane_b32 s69, v39, 30 -; FLATSCR-NEXT: v_readlane_b32 s68, v39, 29 -; FLATSCR-NEXT: v_readlane_b32 s67, v39, 28 -; FLATSCR-NEXT: v_readlane_b32 s66, v39, 27 -; FLATSCR-NEXT: v_readlane_b32 s65, v39, 26 -; FLATSCR-NEXT: v_readlane_b32 s64, v39, 25 -; FLATSCR-NEXT: v_readlane_b32 s63, v39, 24 -; FLATSCR-NEXT: v_readlane_b32 s62, v39, 23 -; FLATSCR-NEXT: v_readlane_b32 s61, v39, 22 -; FLATSCR-NEXT: v_readlane_b32 s60, v39, 21 -; FLATSCR-NEXT: v_readlane_b32 s59, v39, 20 -; FLATSCR-NEXT: v_readlane_b32 s58, v39, 19 -; FLATSCR-NEXT: v_readlane_b32 s57, v39, 18 -; FLATSCR-NEXT: v_readlane_b32 
s56, v39, 17 -; FLATSCR-NEXT: v_readlane_b32 s55, v39, 16 -; FLATSCR-NEXT: v_readlane_b32 s54, v39, 15 -; FLATSCR-NEXT: v_readlane_b32 s53, v39, 14 -; FLATSCR-NEXT: v_readlane_b32 s52, v39, 13 -; FLATSCR-NEXT: v_readlane_b32 s51, v39, 12 -; FLATSCR-NEXT: v_readlane_b32 s50, v39, 11 -; FLATSCR-NEXT: v_readlane_b32 s49, v39, 10 -; FLATSCR-NEXT: v_readlane_b32 s48, v39, 9 -; FLATSCR-NEXT: v_readlane_b32 s47, v39, 8 -; FLATSCR-NEXT: v_readlane_b32 s46, v39, 7 -; FLATSCR-NEXT: v_readlane_b32 s45, v39, 6 -; FLATSCR-NEXT: v_readlane_b32 s44, v39, 5 -; FLATSCR-NEXT: v_readlane_b32 s43, v39, 4 -; FLATSCR-NEXT: v_readlane_b32 s42, v39, 3 -; FLATSCR-NEXT: v_readlane_b32 s41, v39, 2 -; FLATSCR-NEXT: v_readlane_b32 s40, v39, 1 +; FLATSCR-NEXT: v_readlane_b32 s102, v39, 31 +; FLATSCR-NEXT: v_readlane_b32 s101, v39, 30 +; FLATSCR-NEXT: v_readlane_b32 s100, v39, 29 +; FLATSCR-NEXT: v_readlane_b32 s99, v39, 28 +; FLATSCR-NEXT: v_readlane_b32 s98, v39, 27 +; FLATSCR-NEXT: v_readlane_b32 s97, v39, 26 +; FLATSCR-NEXT: v_readlane_b32 s96, v39, 25 +; FLATSCR-NEXT: v_readlane_b32 s87, v39, 24 +; FLATSCR-NEXT: v_readlane_b32 s86, v39, 23 +; FLATSCR-NEXT: v_readlane_b32 s85, v39, 22 +; FLATSCR-NEXT: v_readlane_b32 s84, v39, 21 +; FLATSCR-NEXT: v_readlane_b32 s83, v39, 20 +; FLATSCR-NEXT: v_readlane_b32 s82, v39, 19 +; FLATSCR-NEXT: v_readlane_b32 s81, v39, 18 +; FLATSCR-NEXT: v_readlane_b32 s80, v39, 17 +; FLATSCR-NEXT: v_readlane_b32 s71, v39, 16 +; FLATSCR-NEXT: v_readlane_b32 s70, v39, 15 +; FLATSCR-NEXT: v_readlane_b32 s69, v39, 14 +; FLATSCR-NEXT: v_readlane_b32 s68, v39, 13 +; FLATSCR-NEXT: v_readlane_b32 s67, v39, 12 +; FLATSCR-NEXT: v_readlane_b32 s66, v39, 11 +; FLATSCR-NEXT: v_readlane_b32 s65, v39, 10 +; FLATSCR-NEXT: v_readlane_b32 s64, v39, 9 +; FLATSCR-NEXT: v_readlane_b32 s55, v39, 8 +; FLATSCR-NEXT: v_readlane_b32 s54, v39, 7 +; FLATSCR-NEXT: v_readlane_b32 s53, v39, 6 +; FLATSCR-NEXT: v_readlane_b32 s52, v39, 5 +; FLATSCR-NEXT: v_readlane_b32 s51, v39, 4 +; FLATSCR-NEXT: 
v_readlane_b32 s50, v39, 3 +; FLATSCR-NEXT: v_readlane_b32 s49, v39, 2 +; FLATSCR-NEXT: v_readlane_b32 s48, v39, 1 ; FLATSCR-NEXT: v_readlane_b32 s39, v39, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 @@ -2037,149 +1517,83 @@ define void @callee_need_to_spill_fp_to_reg() #1 { ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] -; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 -; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: v_writelane_b32 v40, s4, 32 +; MUBUF-NEXT: s_addk_i32 s32, 0x200 ; MUBUF-NEXT: v_writelane_b32 v40, s39, 0 -; MUBUF-NEXT: v_writelane_b32 v40, s40, 1 -; MUBUF-NEXT: v_writelane_b32 v40, s41, 2 -; MUBUF-NEXT: v_writelane_b32 v40, s42, 3 -; MUBUF-NEXT: v_writelane_b32 v40, s43, 4 -; MUBUF-NEXT: v_writelane_b32 v40, s44, 5 -; MUBUF-NEXT: v_writelane_b32 v40, s45, 6 -; MUBUF-NEXT: v_writelane_b32 v40, s46, 7 -; MUBUF-NEXT: v_writelane_b32 v40, s47, 8 -; MUBUF-NEXT: v_writelane_b32 v40, s48, 9 -; MUBUF-NEXT: v_writelane_b32 v40, s49, 10 -; MUBUF-NEXT: v_writelane_b32 v40, s50, 11 -; MUBUF-NEXT: v_writelane_b32 v40, s51, 12 -; MUBUF-NEXT: v_writelane_b32 v40, s52, 13 -; MUBUF-NEXT: v_writelane_b32 v40, s53, 14 -; MUBUF-NEXT: v_writelane_b32 v40, s54, 15 -; MUBUF-NEXT: v_writelane_b32 v40, s55, 16 -; MUBUF-NEXT: v_writelane_b32 v40, s56, 17 -; MUBUF-NEXT: v_writelane_b32 v40, s57, 18 -; MUBUF-NEXT: v_writelane_b32 v40, s58, 19 -; MUBUF-NEXT: v_writelane_b32 v40, s59, 20 -; MUBUF-NEXT: v_writelane_b32 v40, s60, 21 -; MUBUF-NEXT: v_writelane_b32 v40, s61, 22 -; MUBUF-NEXT: v_writelane_b32 v40, s62, 23 -; MUBUF-NEXT: v_writelane_b32 v40, s63, 24 -; MUBUF-NEXT: v_writelane_b32 v40, s64, 25 -; MUBUF-NEXT: v_writelane_b32 v40, s65, 26 -; MUBUF-NEXT: v_writelane_b32 v40, s66, 27 -; MUBUF-NEXT: 
v_writelane_b32 v40, s67, 28 -; MUBUF-NEXT: v_writelane_b32 v40, s68, 29 -; MUBUF-NEXT: v_writelane_b32 v40, s69, 30 -; MUBUF-NEXT: v_writelane_b32 v40, s70, 31 -; MUBUF-NEXT: v_writelane_b32 v40, s71, 32 -; MUBUF-NEXT: v_writelane_b32 v40, s72, 33 -; MUBUF-NEXT: v_writelane_b32 v40, s73, 34 -; MUBUF-NEXT: v_writelane_b32 v40, s74, 35 -; MUBUF-NEXT: v_writelane_b32 v40, s75, 36 -; MUBUF-NEXT: v_writelane_b32 v40, s76, 37 -; MUBUF-NEXT: v_writelane_b32 v40, s77, 38 -; MUBUF-NEXT: v_writelane_b32 v40, s78, 39 -; MUBUF-NEXT: v_writelane_b32 v40, s79, 40 -; MUBUF-NEXT: v_writelane_b32 v40, s80, 41 -; MUBUF-NEXT: v_writelane_b32 v40, s81, 42 -; MUBUF-NEXT: v_writelane_b32 v40, s82, 43 -; MUBUF-NEXT: v_writelane_b32 v40, s83, 44 -; MUBUF-NEXT: v_writelane_b32 v40, s84, 45 -; MUBUF-NEXT: v_writelane_b32 v40, s85, 46 -; MUBUF-NEXT: v_writelane_b32 v40, s86, 47 -; MUBUF-NEXT: v_writelane_b32 v40, s87, 48 -; MUBUF-NEXT: v_writelane_b32 v40, s88, 49 -; MUBUF-NEXT: v_writelane_b32 v40, s89, 50 -; MUBUF-NEXT: v_writelane_b32 v40, s90, 51 -; MUBUF-NEXT: v_writelane_b32 v40, s91, 52 -; MUBUF-NEXT: v_writelane_b32 v40, s92, 53 -; MUBUF-NEXT: v_writelane_b32 v40, s93, 54 -; MUBUF-NEXT: v_writelane_b32 v40, s94, 55 -; MUBUF-NEXT: v_writelane_b32 v40, s95, 56 -; MUBUF-NEXT: v_writelane_b32 v40, s96, 57 -; MUBUF-NEXT: v_writelane_b32 v40, s97, 58 -; MUBUF-NEXT: v_writelane_b32 v40, s98, 59 -; MUBUF-NEXT: v_writelane_b32 v40, s99, 60 -; MUBUF-NEXT: v_writelane_b32 v40, s100, 61 -; MUBUF-NEXT: v_writelane_b32 v40, s101, 62 -; MUBUF-NEXT: v_writelane_b32 v40, s102, 63 +; MUBUF-NEXT: v_writelane_b32 v40, s48, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s49, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s50, 3 +; MUBUF-NEXT: v_writelane_b32 v40, s51, 4 +; MUBUF-NEXT: v_writelane_b32 v40, s52, 5 +; MUBUF-NEXT: v_writelane_b32 v40, s53, 6 +; MUBUF-NEXT: v_writelane_b32 v40, s54, 7 +; MUBUF-NEXT: v_writelane_b32 v40, s55, 8 +; MUBUF-NEXT: v_writelane_b32 v40, s64, 9 +; MUBUF-NEXT: v_writelane_b32 v40, s65, 
10 +; MUBUF-NEXT: v_writelane_b32 v40, s66, 11 +; MUBUF-NEXT: v_writelane_b32 v40, s67, 12 +; MUBUF-NEXT: v_writelane_b32 v40, s68, 13 +; MUBUF-NEXT: v_writelane_b32 v40, s69, 14 +; MUBUF-NEXT: v_writelane_b32 v40, s70, 15 +; MUBUF-NEXT: v_writelane_b32 v40, s71, 16 +; MUBUF-NEXT: v_writelane_b32 v40, s80, 17 +; MUBUF-NEXT: v_writelane_b32 v40, s81, 18 +; MUBUF-NEXT: v_writelane_b32 v40, s82, 19 +; MUBUF-NEXT: v_writelane_b32 v40, s83, 20 +; MUBUF-NEXT: v_writelane_b32 v40, s84, 21 +; MUBUF-NEXT: v_writelane_b32 v40, s85, 22 +; MUBUF-NEXT: v_writelane_b32 v40, s86, 23 +; MUBUF-NEXT: v_writelane_b32 v40, s87, 24 +; MUBUF-NEXT: v_writelane_b32 v40, s96, 25 +; MUBUF-NEXT: v_writelane_b32 v40, s97, 26 +; MUBUF-NEXT: v_writelane_b32 v40, s98, 27 +; MUBUF-NEXT: v_writelane_b32 v40, s99, 28 +; MUBUF-NEXT: v_writelane_b32 v40, s100, 29 +; MUBUF-NEXT: v_writelane_b32 v40, s101, 30 +; MUBUF-NEXT: v_writelane_b32 v40, s102, 31 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber all VGPRs except CSR v40 ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_readlane_b32 s102, v40, 63 -; MUBUF-NEXT: v_readlane_b32 s101, v40, 62 -; MUBUF-NEXT: v_readlane_b32 s100, v40, 61 -; MUBUF-NEXT: v_readlane_b32 s99, v40, 60 -; MUBUF-NEXT: v_readlane_b32 s98, v40, 59 -; MUBUF-NEXT: v_readlane_b32 s97, v40, 58 -; MUBUF-NEXT: v_readlane_b32 s96, v40, 57 -; MUBUF-NEXT: v_readlane_b32 s95, v40, 56 -; MUBUF-NEXT: v_readlane_b32 s94, v40, 55 -; MUBUF-NEXT: v_readlane_b32 s93, v40, 54 -; MUBUF-NEXT: v_readlane_b32 s92, v40, 53 -; MUBUF-NEXT: v_readlane_b32 s91, v40, 52 -; MUBUF-NEXT: v_readlane_b32 s90, v40, 51 -; MUBUF-NEXT: v_readlane_b32 s89, v40, 50 -; MUBUF-NEXT: v_readlane_b32 s88, v40, 49 -; MUBUF-NEXT: v_readlane_b32 s87, v40, 48 -; MUBUF-NEXT: v_readlane_b32 s86, v40, 47 -; MUBUF-NEXT: v_readlane_b32 s85, v40, 46 -; MUBUF-NEXT: v_readlane_b32 s84, v40, 45 -; MUBUF-NEXT: v_readlane_b32 s83, v40, 44 -; 
MUBUF-NEXT: v_readlane_b32 s82, v40, 43 -; MUBUF-NEXT: v_readlane_b32 s81, v40, 42 -; MUBUF-NEXT: v_readlane_b32 s80, v40, 41 -; MUBUF-NEXT: v_readlane_b32 s79, v40, 40 -; MUBUF-NEXT: v_readlane_b32 s78, v40, 39 -; MUBUF-NEXT: v_readlane_b32 s77, v40, 38 -; MUBUF-NEXT: v_readlane_b32 s76, v40, 37 -; MUBUF-NEXT: v_readlane_b32 s75, v40, 36 -; MUBUF-NEXT: v_readlane_b32 s74, v40, 35 -; MUBUF-NEXT: v_readlane_b32 s73, v40, 34 -; MUBUF-NEXT: v_readlane_b32 s72, v40, 33 -; MUBUF-NEXT: v_readlane_b32 s71, v40, 32 -; MUBUF-NEXT: v_readlane_b32 s70, v40, 31 -; MUBUF-NEXT: v_readlane_b32 s69, v40, 30 -; MUBUF-NEXT: v_readlane_b32 s68, v40, 29 -; MUBUF-NEXT: v_readlane_b32 s67, v40, 28 -; MUBUF-NEXT: v_readlane_b32 s66, v40, 27 -; MUBUF-NEXT: v_readlane_b32 s65, v40, 26 -; MUBUF-NEXT: v_readlane_b32 s64, v40, 25 -; MUBUF-NEXT: v_readlane_b32 s63, v40, 24 -; MUBUF-NEXT: v_readlane_b32 s62, v40, 23 -; MUBUF-NEXT: v_readlane_b32 s61, v40, 22 -; MUBUF-NEXT: v_readlane_b32 s60, v40, 21 -; MUBUF-NEXT: v_readlane_b32 s59, v40, 20 -; MUBUF-NEXT: v_readlane_b32 s58, v40, 19 -; MUBUF-NEXT: v_readlane_b32 s57, v40, 18 -; MUBUF-NEXT: v_readlane_b32 s56, v40, 17 -; MUBUF-NEXT: v_readlane_b32 s55, v40, 16 -; MUBUF-NEXT: v_readlane_b32 s54, v40, 15 -; MUBUF-NEXT: v_readlane_b32 s53, v40, 14 -; MUBUF-NEXT: v_readlane_b32 s52, v40, 13 -; MUBUF-NEXT: v_readlane_b32 s51, v40, 12 -; MUBUF-NEXT: v_readlane_b32 s50, v40, 11 -; MUBUF-NEXT: v_readlane_b32 s49, v40, 10 -; MUBUF-NEXT: v_readlane_b32 s48, v40, 9 -; MUBUF-NEXT: v_readlane_b32 s47, v40, 8 -; MUBUF-NEXT: v_readlane_b32 s46, v40, 7 -; MUBUF-NEXT: v_readlane_b32 s45, v40, 6 -; MUBUF-NEXT: v_readlane_b32 s44, v40, 5 -; MUBUF-NEXT: v_readlane_b32 s43, v40, 4 -; MUBUF-NEXT: v_readlane_b32 s42, v40, 3 -; MUBUF-NEXT: v_readlane_b32 s41, v40, 2 -; MUBUF-NEXT: v_readlane_b32 s40, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s102, v40, 31 +; MUBUF-NEXT: v_readlane_b32 s101, v40, 30 +; MUBUF-NEXT: v_readlane_b32 s100, v40, 29 +; MUBUF-NEXT: v_readlane_b32 
s99, v40, 28 +; MUBUF-NEXT: v_readlane_b32 s98, v40, 27 +; MUBUF-NEXT: v_readlane_b32 s97, v40, 26 +; MUBUF-NEXT: v_readlane_b32 s96, v40, 25 +; MUBUF-NEXT: v_readlane_b32 s87, v40, 24 +; MUBUF-NEXT: v_readlane_b32 s86, v40, 23 +; MUBUF-NEXT: v_readlane_b32 s85, v40, 22 +; MUBUF-NEXT: v_readlane_b32 s84, v40, 21 +; MUBUF-NEXT: v_readlane_b32 s83, v40, 20 +; MUBUF-NEXT: v_readlane_b32 s82, v40, 19 +; MUBUF-NEXT: v_readlane_b32 s81, v40, 18 +; MUBUF-NEXT: v_readlane_b32 s80, v40, 17 +; MUBUF-NEXT: v_readlane_b32 s71, v40, 16 +; MUBUF-NEXT: v_readlane_b32 s70, v40, 15 +; MUBUF-NEXT: v_readlane_b32 s69, v40, 14 +; MUBUF-NEXT: v_readlane_b32 s68, v40, 13 +; MUBUF-NEXT: v_readlane_b32 s67, v40, 12 +; MUBUF-NEXT: v_readlane_b32 s66, v40, 11 +; MUBUF-NEXT: v_readlane_b32 s65, v40, 10 +; MUBUF-NEXT: v_readlane_b32 s64, v40, 9 +; MUBUF-NEXT: v_readlane_b32 s55, v40, 8 +; MUBUF-NEXT: v_readlane_b32 s54, v40, 7 +; MUBUF-NEXT: v_readlane_b32 s53, v40, 6 +; MUBUF-NEXT: v_readlane_b32 s52, v40, 5 +; MUBUF-NEXT: v_readlane_b32 s51, v40, 4 +; MUBUF-NEXT: v_readlane_b32 s50, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s49, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s48, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s39, v40, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 -; MUBUF-NEXT: v_readlane_b32 s4, v41, 0 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 32 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: s_mov_b32 s33, s4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -2195,138 +1609,74 @@ define void @callee_need_to_spill_fp_to_reg() #1 { ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, 8 ; FLATSCR-NEXT: v_writelane_b32 v40, s39, 0 -; FLATSCR-NEXT: v_writelane_b32 v40, s40, 1 -; FLATSCR-NEXT: v_writelane_b32 v40, s41, 2 -; FLATSCR-NEXT: v_writelane_b32 v40, s42, 3 -; FLATSCR-NEXT: v_writelane_b32 
v40, s43, 4 -; FLATSCR-NEXT: v_writelane_b32 v40, s44, 5 -; FLATSCR-NEXT: v_writelane_b32 v40, s45, 6 -; FLATSCR-NEXT: v_writelane_b32 v40, s46, 7 -; FLATSCR-NEXT: v_writelane_b32 v40, s47, 8 -; FLATSCR-NEXT: v_writelane_b32 v40, s48, 9 -; FLATSCR-NEXT: v_writelane_b32 v40, s49, 10 -; FLATSCR-NEXT: v_writelane_b32 v40, s50, 11 -; FLATSCR-NEXT: v_writelane_b32 v40, s51, 12 -; FLATSCR-NEXT: v_writelane_b32 v40, s52, 13 -; FLATSCR-NEXT: v_writelane_b32 v40, s53, 14 -; FLATSCR-NEXT: v_writelane_b32 v40, s54, 15 -; FLATSCR-NEXT: v_writelane_b32 v40, s55, 16 -; FLATSCR-NEXT: v_writelane_b32 v40, s56, 17 -; FLATSCR-NEXT: v_writelane_b32 v40, s57, 18 -; FLATSCR-NEXT: v_writelane_b32 v40, s58, 19 -; FLATSCR-NEXT: v_writelane_b32 v40, s59, 20 -; FLATSCR-NEXT: v_writelane_b32 v40, s60, 21 -; FLATSCR-NEXT: v_writelane_b32 v40, s61, 22 -; FLATSCR-NEXT: v_writelane_b32 v40, s62, 23 -; FLATSCR-NEXT: v_writelane_b32 v40, s63, 24 -; FLATSCR-NEXT: v_writelane_b32 v40, s64, 25 -; FLATSCR-NEXT: v_writelane_b32 v40, s65, 26 -; FLATSCR-NEXT: v_writelane_b32 v40, s66, 27 -; FLATSCR-NEXT: v_writelane_b32 v40, s67, 28 -; FLATSCR-NEXT: v_writelane_b32 v40, s68, 29 -; FLATSCR-NEXT: v_writelane_b32 v40, s69, 30 -; FLATSCR-NEXT: v_writelane_b32 v40, s70, 31 -; FLATSCR-NEXT: v_writelane_b32 v40, s71, 32 -; FLATSCR-NEXT: v_writelane_b32 v40, s72, 33 -; FLATSCR-NEXT: v_writelane_b32 v40, s73, 34 -; FLATSCR-NEXT: v_writelane_b32 v40, s74, 35 -; FLATSCR-NEXT: v_writelane_b32 v40, s75, 36 -; FLATSCR-NEXT: v_writelane_b32 v40, s76, 37 -; FLATSCR-NEXT: v_writelane_b32 v40, s77, 38 -; FLATSCR-NEXT: v_writelane_b32 v40, s78, 39 -; FLATSCR-NEXT: v_writelane_b32 v40, s79, 40 -; FLATSCR-NEXT: v_writelane_b32 v40, s80, 41 -; FLATSCR-NEXT: v_writelane_b32 v40, s81, 42 -; FLATSCR-NEXT: v_writelane_b32 v40, s82, 43 -; FLATSCR-NEXT: v_writelane_b32 v40, s83, 44 -; FLATSCR-NEXT: v_writelane_b32 v40, s84, 45 -; FLATSCR-NEXT: v_writelane_b32 v40, s85, 46 -; FLATSCR-NEXT: v_writelane_b32 v40, s86, 47 -; 
FLATSCR-NEXT: v_writelane_b32 v40, s87, 48 -; FLATSCR-NEXT: v_writelane_b32 v40, s88, 49 -; FLATSCR-NEXT: v_writelane_b32 v40, s89, 50 -; FLATSCR-NEXT: v_writelane_b32 v40, s90, 51 -; FLATSCR-NEXT: v_writelane_b32 v40, s91, 52 -; FLATSCR-NEXT: v_writelane_b32 v40, s92, 53 -; FLATSCR-NEXT: v_writelane_b32 v40, s93, 54 -; FLATSCR-NEXT: v_writelane_b32 v40, s94, 55 -; FLATSCR-NEXT: v_writelane_b32 v40, s95, 56 -; FLATSCR-NEXT: v_writelane_b32 v40, s96, 57 -; FLATSCR-NEXT: v_writelane_b32 v40, s97, 58 -; FLATSCR-NEXT: v_writelane_b32 v40, s98, 59 -; FLATSCR-NEXT: v_writelane_b32 v40, s99, 60 -; FLATSCR-NEXT: v_writelane_b32 v40, s100, 61 -; FLATSCR-NEXT: v_writelane_b32 v40, s101, 62 -; FLATSCR-NEXT: v_writelane_b32 v40, s102, 63 +; FLATSCR-NEXT: v_writelane_b32 v40, s48, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s49, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s50, 3 +; FLATSCR-NEXT: v_writelane_b32 v40, s51, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s52, 5 +; FLATSCR-NEXT: v_writelane_b32 v40, s53, 6 +; FLATSCR-NEXT: v_writelane_b32 v40, s54, 7 +; FLATSCR-NEXT: v_writelane_b32 v40, s55, 8 +; FLATSCR-NEXT: v_writelane_b32 v40, s64, 9 +; FLATSCR-NEXT: v_writelane_b32 v40, s65, 10 +; FLATSCR-NEXT: v_writelane_b32 v40, s66, 11 +; FLATSCR-NEXT: v_writelane_b32 v40, s67, 12 +; FLATSCR-NEXT: v_writelane_b32 v40, s68, 13 +; FLATSCR-NEXT: v_writelane_b32 v40, s69, 14 +; FLATSCR-NEXT: v_writelane_b32 v40, s70, 15 +; FLATSCR-NEXT: v_writelane_b32 v40, s71, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s80, 17 +; FLATSCR-NEXT: v_writelane_b32 v40, s81, 18 +; FLATSCR-NEXT: v_writelane_b32 v40, s82, 19 +; FLATSCR-NEXT: v_writelane_b32 v40, s83, 20 +; FLATSCR-NEXT: v_writelane_b32 v40, s84, 21 +; FLATSCR-NEXT: v_writelane_b32 v40, s85, 22 +; FLATSCR-NEXT: v_writelane_b32 v40, s86, 23 +; FLATSCR-NEXT: v_writelane_b32 v40, s87, 24 +; FLATSCR-NEXT: v_writelane_b32 v40, s96, 25 +; FLATSCR-NEXT: v_writelane_b32 v40, s97, 26 +; FLATSCR-NEXT: v_writelane_b32 v40, s98, 27 +; FLATSCR-NEXT: 
v_writelane_b32 v40, s99, 28 +; FLATSCR-NEXT: v_writelane_b32 v40, s100, 29 +; FLATSCR-NEXT: v_writelane_b32 v40, s101, 30 +; FLATSCR-NEXT: v_writelane_b32 v40, s102, 31 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber all VGPRs except CSR v40 ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s102, v40, 63 -; FLATSCR-NEXT: v_readlane_b32 s101, v40, 62 -; FLATSCR-NEXT: v_readlane_b32 s100, v40, 61 -; FLATSCR-NEXT: v_readlane_b32 s99, v40, 60 -; FLATSCR-NEXT: v_readlane_b32 s98, v40, 59 -; FLATSCR-NEXT: v_readlane_b32 s97, v40, 58 -; FLATSCR-NEXT: v_readlane_b32 s96, v40, 57 -; FLATSCR-NEXT: v_readlane_b32 s95, v40, 56 -; FLATSCR-NEXT: v_readlane_b32 s94, v40, 55 -; FLATSCR-NEXT: v_readlane_b32 s93, v40, 54 -; FLATSCR-NEXT: v_readlane_b32 s92, v40, 53 -; FLATSCR-NEXT: v_readlane_b32 s91, v40, 52 -; FLATSCR-NEXT: v_readlane_b32 s90, v40, 51 -; FLATSCR-NEXT: v_readlane_b32 s89, v40, 50 -; FLATSCR-NEXT: v_readlane_b32 s88, v40, 49 -; FLATSCR-NEXT: v_readlane_b32 s87, v40, 48 -; FLATSCR-NEXT: v_readlane_b32 s86, v40, 47 -; FLATSCR-NEXT: v_readlane_b32 s85, v40, 46 -; FLATSCR-NEXT: v_readlane_b32 s84, v40, 45 -; FLATSCR-NEXT: v_readlane_b32 s83, v40, 44 -; FLATSCR-NEXT: v_readlane_b32 s82, v40, 43 -; FLATSCR-NEXT: v_readlane_b32 s81, v40, 42 -; FLATSCR-NEXT: v_readlane_b32 s80, v40, 41 -; FLATSCR-NEXT: v_readlane_b32 s79, v40, 40 -; FLATSCR-NEXT: v_readlane_b32 s78, v40, 39 -; FLATSCR-NEXT: v_readlane_b32 s77, v40, 38 -; FLATSCR-NEXT: v_readlane_b32 s76, v40, 37 -; FLATSCR-NEXT: v_readlane_b32 s75, v40, 36 -; FLATSCR-NEXT: v_readlane_b32 s74, v40, 35 -; FLATSCR-NEXT: v_readlane_b32 s73, v40, 34 -; FLATSCR-NEXT: v_readlane_b32 s72, v40, 33 -; FLATSCR-NEXT: v_readlane_b32 s71, v40, 32 -; FLATSCR-NEXT: v_readlane_b32 s70, v40, 31 -; FLATSCR-NEXT: v_readlane_b32 s69, v40, 30 -; FLATSCR-NEXT: v_readlane_b32 s68, v40, 29 -; FLATSCR-NEXT: v_readlane_b32 
s67, v40, 28 -; FLATSCR-NEXT: v_readlane_b32 s66, v40, 27 -; FLATSCR-NEXT: v_readlane_b32 s65, v40, 26 -; FLATSCR-NEXT: v_readlane_b32 s64, v40, 25 -; FLATSCR-NEXT: v_readlane_b32 s63, v40, 24 -; FLATSCR-NEXT: v_readlane_b32 s62, v40, 23 -; FLATSCR-NEXT: v_readlane_b32 s61, v40, 22 -; FLATSCR-NEXT: v_readlane_b32 s60, v40, 21 -; FLATSCR-NEXT: v_readlane_b32 s59, v40, 20 -; FLATSCR-NEXT: v_readlane_b32 s58, v40, 19 -; FLATSCR-NEXT: v_readlane_b32 s57, v40, 18 -; FLATSCR-NEXT: v_readlane_b32 s56, v40, 17 -; FLATSCR-NEXT: v_readlane_b32 s55, v40, 16 -; FLATSCR-NEXT: v_readlane_b32 s54, v40, 15 -; FLATSCR-NEXT: v_readlane_b32 s53, v40, 14 -; FLATSCR-NEXT: v_readlane_b32 s52, v40, 13 -; FLATSCR-NEXT: v_readlane_b32 s51, v40, 12 -; FLATSCR-NEXT: v_readlane_b32 s50, v40, 11 -; FLATSCR-NEXT: v_readlane_b32 s49, v40, 10 -; FLATSCR-NEXT: v_readlane_b32 s48, v40, 9 -; FLATSCR-NEXT: v_readlane_b32 s47, v40, 8 -; FLATSCR-NEXT: v_readlane_b32 s46, v40, 7 -; FLATSCR-NEXT: v_readlane_b32 s45, v40, 6 -; FLATSCR-NEXT: v_readlane_b32 s44, v40, 5 -; FLATSCR-NEXT: v_readlane_b32 s43, v40, 4 -; FLATSCR-NEXT: v_readlane_b32 s42, v40, 3 -; FLATSCR-NEXT: v_readlane_b32 s41, v40, 2 -; FLATSCR-NEXT: v_readlane_b32 s40, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s102, v40, 31 +; FLATSCR-NEXT: v_readlane_b32 s101, v40, 30 +; FLATSCR-NEXT: v_readlane_b32 s100, v40, 29 +; FLATSCR-NEXT: v_readlane_b32 s99, v40, 28 +; FLATSCR-NEXT: v_readlane_b32 s98, v40, 27 +; FLATSCR-NEXT: v_readlane_b32 s97, v40, 26 +; FLATSCR-NEXT: v_readlane_b32 s96, v40, 25 +; FLATSCR-NEXT: v_readlane_b32 s87, v40, 24 +; FLATSCR-NEXT: v_readlane_b32 s86, v40, 23 +; FLATSCR-NEXT: v_readlane_b32 s85, v40, 22 +; FLATSCR-NEXT: v_readlane_b32 s84, v40, 21 +; FLATSCR-NEXT: v_readlane_b32 s83, v40, 20 +; FLATSCR-NEXT: v_readlane_b32 s82, v40, 19 +; FLATSCR-NEXT: v_readlane_b32 s81, v40, 18 +; FLATSCR-NEXT: v_readlane_b32 s80, v40, 17 +; FLATSCR-NEXT: v_readlane_b32 s71, v40, 16 +; FLATSCR-NEXT: v_readlane_b32 s70, v40, 15 +; 
FLATSCR-NEXT: v_readlane_b32 s69, v40, 14 +; FLATSCR-NEXT: v_readlane_b32 s68, v40, 13 +; FLATSCR-NEXT: v_readlane_b32 s67, v40, 12 +; FLATSCR-NEXT: v_readlane_b32 s66, v40, 11 +; FLATSCR-NEXT: v_readlane_b32 s65, v40, 10 +; FLATSCR-NEXT: v_readlane_b32 s64, v40, 9 +; FLATSCR-NEXT: v_readlane_b32 s55, v40, 8 +; FLATSCR-NEXT: v_readlane_b32 s54, v40, 7 +; FLATSCR-NEXT: v_readlane_b32 s53, v40, 6 +; FLATSCR-NEXT: v_readlane_b32 s52, v40, 5 +; FLATSCR-NEXT: v_readlane_b32 s51, v40, 4 +; FLATSCR-NEXT: v_readlane_b32 s50, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s49, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s48, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s39, v40, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -2367,74 +1717,40 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5) ; MUBUF-NEXT: s_add_i32 s5, s33, 0x40100 ; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] -; MUBUF-NEXT: v_mov_b32_e32 v0, s4 -; MUBUF-NEXT: s_add_i32 s5, s33, 0x40200 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; MUBUF-NEXT: s_add_i32 s32, s32, 0x40400 +; MUBUF-NEXT: v_writelane_b32 v39, s4, 32 +; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300 ; MUBUF-NEXT: v_writelane_b32 v39, s39, 0 -; MUBUF-NEXT: v_writelane_b32 v39, s40, 1 -; MUBUF-NEXT: v_writelane_b32 v39, s41, 2 -; MUBUF-NEXT: v_writelane_b32 v39, s42, 3 -; MUBUF-NEXT: v_writelane_b32 v39, s43, 4 -; MUBUF-NEXT: v_writelane_b32 v39, s44, 5 -; MUBUF-NEXT: v_writelane_b32 v39, s45, 6 -; MUBUF-NEXT: v_writelane_b32 v39, s46, 7 -; MUBUF-NEXT: v_writelane_b32 v39, s47, 8 -; MUBUF-NEXT: v_writelane_b32 v39, s48, 9 -; MUBUF-NEXT: v_writelane_b32 v39, s49, 10 -; MUBUF-NEXT: v_writelane_b32 v39, s50, 11 -; MUBUF-NEXT: v_writelane_b32 v39, s51, 12 -; MUBUF-NEXT: v_writelane_b32 v39, s52, 13 -; MUBUF-NEXT: v_writelane_b32 v39, s53, 14 -; MUBUF-NEXT: v_writelane_b32 v39, s54, 15 -; MUBUF-NEXT: 
v_writelane_b32 v39, s55, 16 -; MUBUF-NEXT: v_writelane_b32 v39, s56, 17 -; MUBUF-NEXT: v_writelane_b32 v39, s57, 18 -; MUBUF-NEXT: v_writelane_b32 v39, s58, 19 -; MUBUF-NEXT: v_writelane_b32 v39, s59, 20 -; MUBUF-NEXT: v_writelane_b32 v39, s60, 21 -; MUBUF-NEXT: v_writelane_b32 v39, s61, 22 -; MUBUF-NEXT: v_writelane_b32 v39, s62, 23 -; MUBUF-NEXT: v_writelane_b32 v39, s63, 24 -; MUBUF-NEXT: v_writelane_b32 v39, s64, 25 -; MUBUF-NEXT: v_writelane_b32 v39, s65, 26 -; MUBUF-NEXT: v_writelane_b32 v39, s66, 27 -; MUBUF-NEXT: v_writelane_b32 v39, s67, 28 -; MUBUF-NEXT: v_writelane_b32 v39, s68, 29 -; MUBUF-NEXT: v_writelane_b32 v39, s69, 30 -; MUBUF-NEXT: v_writelane_b32 v39, s70, 31 -; MUBUF-NEXT: v_writelane_b32 v39, s71, 32 -; MUBUF-NEXT: v_writelane_b32 v39, s72, 33 -; MUBUF-NEXT: v_writelane_b32 v39, s73, 34 -; MUBUF-NEXT: v_writelane_b32 v39, s74, 35 -; MUBUF-NEXT: v_writelane_b32 v39, s75, 36 -; MUBUF-NEXT: v_writelane_b32 v39, s76, 37 -; MUBUF-NEXT: v_writelane_b32 v39, s77, 38 -; MUBUF-NEXT: v_writelane_b32 v39, s78, 39 -; MUBUF-NEXT: v_writelane_b32 v39, s79, 40 -; MUBUF-NEXT: v_writelane_b32 v39, s80, 41 -; MUBUF-NEXT: v_writelane_b32 v39, s81, 42 -; MUBUF-NEXT: v_writelane_b32 v39, s82, 43 -; MUBUF-NEXT: v_writelane_b32 v39, s83, 44 -; MUBUF-NEXT: v_writelane_b32 v39, s84, 45 -; MUBUF-NEXT: v_writelane_b32 v39, s85, 46 -; MUBUF-NEXT: v_writelane_b32 v39, s86, 47 -; MUBUF-NEXT: v_writelane_b32 v39, s87, 48 -; MUBUF-NEXT: v_writelane_b32 v39, s88, 49 -; MUBUF-NEXT: v_writelane_b32 v39, s89, 50 -; MUBUF-NEXT: v_writelane_b32 v39, s90, 51 -; MUBUF-NEXT: v_writelane_b32 v39, s91, 52 -; MUBUF-NEXT: v_writelane_b32 v39, s92, 53 -; MUBUF-NEXT: v_writelane_b32 v39, s93, 54 -; MUBUF-NEXT: v_writelane_b32 v39, s94, 55 -; MUBUF-NEXT: v_writelane_b32 v39, s95, 56 -; MUBUF-NEXT: v_writelane_b32 v39, s96, 57 -; MUBUF-NEXT: v_writelane_b32 v39, s97, 58 -; MUBUF-NEXT: v_writelane_b32 v39, s98, 59 -; MUBUF-NEXT: v_writelane_b32 v39, s99, 60 -; MUBUF-NEXT: v_writelane_b32 
v39, s100, 61 -; MUBUF-NEXT: v_writelane_b32 v39, s101, 62 -; MUBUF-NEXT: v_writelane_b32 v39, s102, 63 +; MUBUF-NEXT: v_writelane_b32 v39, s48, 1 +; MUBUF-NEXT: v_writelane_b32 v39, s49, 2 +; MUBUF-NEXT: v_writelane_b32 v39, s50, 3 +; MUBUF-NEXT: v_writelane_b32 v39, s51, 4 +; MUBUF-NEXT: v_writelane_b32 v39, s52, 5 +; MUBUF-NEXT: v_writelane_b32 v39, s53, 6 +; MUBUF-NEXT: v_writelane_b32 v39, s54, 7 +; MUBUF-NEXT: v_writelane_b32 v39, s55, 8 +; MUBUF-NEXT: v_writelane_b32 v39, s64, 9 +; MUBUF-NEXT: v_writelane_b32 v39, s65, 10 +; MUBUF-NEXT: v_writelane_b32 v39, s66, 11 +; MUBUF-NEXT: v_writelane_b32 v39, s67, 12 +; MUBUF-NEXT: v_writelane_b32 v39, s68, 13 +; MUBUF-NEXT: v_writelane_b32 v39, s69, 14 +; MUBUF-NEXT: v_writelane_b32 v39, s70, 15 +; MUBUF-NEXT: v_writelane_b32 v39, s71, 16 +; MUBUF-NEXT: v_writelane_b32 v39, s80, 17 +; MUBUF-NEXT: v_writelane_b32 v39, s81, 18 +; MUBUF-NEXT: v_writelane_b32 v39, s82, 19 +; MUBUF-NEXT: v_writelane_b32 v39, s83, 20 +; MUBUF-NEXT: v_writelane_b32 v39, s84, 21 +; MUBUF-NEXT: v_writelane_b32 v39, s85, 22 +; MUBUF-NEXT: v_writelane_b32 v39, s86, 23 +; MUBUF-NEXT: v_writelane_b32 v39, s87, 24 +; MUBUF-NEXT: v_writelane_b32 v39, s96, 25 +; MUBUF-NEXT: v_writelane_b32 v39, s97, 26 +; MUBUF-NEXT: v_writelane_b32 v39, s98, 27 +; MUBUF-NEXT: v_writelane_b32 v39, s99, 28 +; MUBUF-NEXT: v_writelane_b32 v39, s100, 29 +; MUBUF-NEXT: v_writelane_b32 v39, s101, 30 +; MUBUF-NEXT: v_writelane_b32 v39, s102, 31 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1000 ; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen @@ -2445,75 +1761,40 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber all VGPRs except CSR v40 ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: s_add_i32 s5, s33, 0x40200 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; MUBUF-NEXT: v_readlane_b32 s102, v39, 63 -; MUBUF-NEXT: v_readlane_b32 s101, 
v39, 62 -; MUBUF-NEXT: v_readlane_b32 s100, v39, 61 -; MUBUF-NEXT: v_readlane_b32 s99, v39, 60 -; MUBUF-NEXT: v_readlane_b32 s98, v39, 59 -; MUBUF-NEXT: v_readlane_b32 s97, v39, 58 -; MUBUF-NEXT: v_readlane_b32 s96, v39, 57 -; MUBUF-NEXT: v_readlane_b32 s95, v39, 56 -; MUBUF-NEXT: v_readlane_b32 s94, v39, 55 -; MUBUF-NEXT: v_readlane_b32 s93, v39, 54 -; MUBUF-NEXT: v_readlane_b32 s92, v39, 53 -; MUBUF-NEXT: v_readlane_b32 s91, v39, 52 -; MUBUF-NEXT: v_readlane_b32 s90, v39, 51 -; MUBUF-NEXT: v_readlane_b32 s89, v39, 50 -; MUBUF-NEXT: v_readlane_b32 s88, v39, 49 -; MUBUF-NEXT: v_readlane_b32 s87, v39, 48 -; MUBUF-NEXT: v_readlane_b32 s86, v39, 47 -; MUBUF-NEXT: v_readlane_b32 s85, v39, 46 -; MUBUF-NEXT: v_readlane_b32 s84, v39, 45 -; MUBUF-NEXT: v_readlane_b32 s83, v39, 44 -; MUBUF-NEXT: v_readlane_b32 s82, v39, 43 -; MUBUF-NEXT: v_readlane_b32 s81, v39, 42 -; MUBUF-NEXT: v_readlane_b32 s80, v39, 41 -; MUBUF-NEXT: v_readlane_b32 s79, v39, 40 -; MUBUF-NEXT: v_readlane_b32 s78, v39, 39 -; MUBUF-NEXT: v_readlane_b32 s77, v39, 38 -; MUBUF-NEXT: v_readlane_b32 s76, v39, 37 -; MUBUF-NEXT: v_readlane_b32 s75, v39, 36 -; MUBUF-NEXT: v_readlane_b32 s74, v39, 35 -; MUBUF-NEXT: v_readlane_b32 s73, v39, 34 -; MUBUF-NEXT: v_readlane_b32 s72, v39, 33 -; MUBUF-NEXT: v_readlane_b32 s71, v39, 32 -; MUBUF-NEXT: v_readlane_b32 s70, v39, 31 -; MUBUF-NEXT: v_readlane_b32 s69, v39, 30 -; MUBUF-NEXT: v_readlane_b32 s68, v39, 29 -; MUBUF-NEXT: v_readlane_b32 s67, v39, 28 -; MUBUF-NEXT: v_readlane_b32 s66, v39, 27 -; MUBUF-NEXT: v_readlane_b32 s65, v39, 26 -; MUBUF-NEXT: v_readlane_b32 s64, v39, 25 -; MUBUF-NEXT: v_readlane_b32 s63, v39, 24 -; MUBUF-NEXT: v_readlane_b32 s62, v39, 23 -; MUBUF-NEXT: v_readlane_b32 s61, v39, 22 -; MUBUF-NEXT: v_readlane_b32 s60, v39, 21 -; MUBUF-NEXT: v_readlane_b32 s59, v39, 20 -; MUBUF-NEXT: v_readlane_b32 s58, v39, 19 -; MUBUF-NEXT: v_readlane_b32 s57, v39, 18 -; MUBUF-NEXT: v_readlane_b32 s56, v39, 17 -; MUBUF-NEXT: v_readlane_b32 s55, v39, 16 -; 
MUBUF-NEXT: v_readlane_b32 s54, v39, 15 -; MUBUF-NEXT: v_readlane_b32 s53, v39, 14 -; MUBUF-NEXT: v_readlane_b32 s52, v39, 13 -; MUBUF-NEXT: v_readlane_b32 s51, v39, 12 -; MUBUF-NEXT: v_readlane_b32 s50, v39, 11 -; MUBUF-NEXT: v_readlane_b32 s49, v39, 10 -; MUBUF-NEXT: v_readlane_b32 s48, v39, 9 -; MUBUF-NEXT: v_readlane_b32 s47, v39, 8 -; MUBUF-NEXT: v_readlane_b32 s46, v39, 7 -; MUBUF-NEXT: v_readlane_b32 s45, v39, 6 -; MUBUF-NEXT: v_readlane_b32 s44, v39, 5 -; MUBUF-NEXT: v_readlane_b32 s43, v39, 4 -; MUBUF-NEXT: v_readlane_b32 s42, v39, 3 -; MUBUF-NEXT: v_readlane_b32 s41, v39, 2 -; MUBUF-NEXT: v_readlane_b32 s40, v39, 1 +; MUBUF-NEXT: v_readlane_b32 s102, v39, 31 +; MUBUF-NEXT: v_readlane_b32 s101, v39, 30 +; MUBUF-NEXT: v_readlane_b32 s100, v39, 29 +; MUBUF-NEXT: v_readlane_b32 s99, v39, 28 +; MUBUF-NEXT: v_readlane_b32 s98, v39, 27 +; MUBUF-NEXT: v_readlane_b32 s97, v39, 26 +; MUBUF-NEXT: v_readlane_b32 s96, v39, 25 +; MUBUF-NEXT: v_readlane_b32 s87, v39, 24 +; MUBUF-NEXT: v_readlane_b32 s86, v39, 23 +; MUBUF-NEXT: v_readlane_b32 s85, v39, 22 +; MUBUF-NEXT: v_readlane_b32 s84, v39, 21 +; MUBUF-NEXT: v_readlane_b32 s83, v39, 20 +; MUBUF-NEXT: v_readlane_b32 s82, v39, 19 +; MUBUF-NEXT: v_readlane_b32 s81, v39, 18 +; MUBUF-NEXT: v_readlane_b32 s80, v39, 17 +; MUBUF-NEXT: v_readlane_b32 s71, v39, 16 +; MUBUF-NEXT: v_readlane_b32 s70, v39, 15 +; MUBUF-NEXT: v_readlane_b32 s69, v39, 14 +; MUBUF-NEXT: v_readlane_b32 s68, v39, 13 +; MUBUF-NEXT: v_readlane_b32 s67, v39, 12 +; MUBUF-NEXT: v_readlane_b32 s66, v39, 11 +; MUBUF-NEXT: v_readlane_b32 s65, v39, 10 +; MUBUF-NEXT: v_readlane_b32 s64, v39, 9 +; MUBUF-NEXT: v_readlane_b32 s55, v39, 8 +; MUBUF-NEXT: v_readlane_b32 s54, v39, 7 +; MUBUF-NEXT: v_readlane_b32 s53, v39, 6 +; MUBUF-NEXT: v_readlane_b32 s52, v39, 5 +; MUBUF-NEXT: v_readlane_b32 s51, v39, 4 +; MUBUF-NEXT: v_readlane_b32 s50, v39, 3 +; MUBUF-NEXT: v_readlane_b32 s49, v39, 2 +; MUBUF-NEXT: v_readlane_b32 s48, v39, 1 ; MUBUF-NEXT: v_readlane_b32 s39, v39, 
0 ; MUBUF-NEXT: s_mov_b32 s32, s33 -; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_readfirstlane_b32 s4, v0 +; MUBUF-NEXT: v_readlane_b32 s4, v39, 32 ; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: s_add_i32 s5, s33, 0x40100 ; MUBUF-NEXT: buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload @@ -2533,69 +1814,37 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5) ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_addk_i32 s32, 0x100c ; FLATSCR-NEXT: v_writelane_b32 v39, s39, 0 -; FLATSCR-NEXT: v_writelane_b32 v39, s40, 1 -; FLATSCR-NEXT: v_writelane_b32 v39, s41, 2 -; FLATSCR-NEXT: v_writelane_b32 v39, s42, 3 -; FLATSCR-NEXT: v_writelane_b32 v39, s43, 4 -; FLATSCR-NEXT: v_writelane_b32 v39, s44, 5 -; FLATSCR-NEXT: v_writelane_b32 v39, s45, 6 -; FLATSCR-NEXT: v_writelane_b32 v39, s46, 7 -; FLATSCR-NEXT: v_writelane_b32 v39, s47, 8 -; FLATSCR-NEXT: v_writelane_b32 v39, s48, 9 -; FLATSCR-NEXT: v_writelane_b32 v39, s49, 10 -; FLATSCR-NEXT: v_writelane_b32 v39, s50, 11 -; FLATSCR-NEXT: v_writelane_b32 v39, s51, 12 -; FLATSCR-NEXT: v_writelane_b32 v39, s52, 13 -; FLATSCR-NEXT: v_writelane_b32 v39, s53, 14 -; FLATSCR-NEXT: v_writelane_b32 v39, s54, 15 -; FLATSCR-NEXT: v_writelane_b32 v39, s55, 16 -; FLATSCR-NEXT: v_writelane_b32 v39, s56, 17 -; FLATSCR-NEXT: v_writelane_b32 v39, s57, 18 -; FLATSCR-NEXT: v_writelane_b32 v39, s58, 19 -; FLATSCR-NEXT: v_writelane_b32 v39, s59, 20 -; FLATSCR-NEXT: v_writelane_b32 v39, s60, 21 -; FLATSCR-NEXT: v_writelane_b32 v39, s61, 22 -; FLATSCR-NEXT: v_writelane_b32 v39, s62, 23 -; FLATSCR-NEXT: v_writelane_b32 v39, s63, 24 -; FLATSCR-NEXT: v_writelane_b32 v39, s64, 25 -; FLATSCR-NEXT: v_writelane_b32 v39, s65, 26 -; FLATSCR-NEXT: v_writelane_b32 v39, s66, 27 -; FLATSCR-NEXT: v_writelane_b32 v39, s67, 28 -; FLATSCR-NEXT: v_writelane_b32 v39, s68, 29 -; FLATSCR-NEXT: v_writelane_b32 v39, s69, 30 -; FLATSCR-NEXT: v_writelane_b32 v39, s70, 31 -; FLATSCR-NEXT: v_writelane_b32 v39, 
s71, 32 -; FLATSCR-NEXT: v_writelane_b32 v39, s72, 33 -; FLATSCR-NEXT: v_writelane_b32 v39, s73, 34 -; FLATSCR-NEXT: v_writelane_b32 v39, s74, 35 -; FLATSCR-NEXT: v_writelane_b32 v39, s75, 36 -; FLATSCR-NEXT: v_writelane_b32 v39, s76, 37 -; FLATSCR-NEXT: v_writelane_b32 v39, s77, 38 -; FLATSCR-NEXT: v_writelane_b32 v39, s78, 39 -; FLATSCR-NEXT: v_writelane_b32 v39, s79, 40 -; FLATSCR-NEXT: v_writelane_b32 v39, s80, 41 -; FLATSCR-NEXT: v_writelane_b32 v39, s81, 42 -; FLATSCR-NEXT: v_writelane_b32 v39, s82, 43 -; FLATSCR-NEXT: v_writelane_b32 v39, s83, 44 -; FLATSCR-NEXT: v_writelane_b32 v39, s84, 45 -; FLATSCR-NEXT: v_writelane_b32 v39, s85, 46 -; FLATSCR-NEXT: v_writelane_b32 v39, s86, 47 -; FLATSCR-NEXT: v_writelane_b32 v39, s87, 48 -; FLATSCR-NEXT: v_writelane_b32 v39, s88, 49 -; FLATSCR-NEXT: v_writelane_b32 v39, s89, 50 -; FLATSCR-NEXT: v_writelane_b32 v39, s90, 51 -; FLATSCR-NEXT: v_writelane_b32 v39, s91, 52 -; FLATSCR-NEXT: v_writelane_b32 v39, s92, 53 -; FLATSCR-NEXT: v_writelane_b32 v39, s93, 54 -; FLATSCR-NEXT: v_writelane_b32 v39, s94, 55 -; FLATSCR-NEXT: v_writelane_b32 v39, s95, 56 -; FLATSCR-NEXT: v_writelane_b32 v39, s96, 57 -; FLATSCR-NEXT: v_writelane_b32 v39, s97, 58 -; FLATSCR-NEXT: v_writelane_b32 v39, s98, 59 -; FLATSCR-NEXT: v_writelane_b32 v39, s99, 60 -; FLATSCR-NEXT: v_writelane_b32 v39, s100, 61 -; FLATSCR-NEXT: v_writelane_b32 v39, s101, 62 -; FLATSCR-NEXT: v_writelane_b32 v39, s102, 63 +; FLATSCR-NEXT: v_writelane_b32 v39, s48, 1 +; FLATSCR-NEXT: v_writelane_b32 v39, s49, 2 +; FLATSCR-NEXT: v_writelane_b32 v39, s50, 3 +; FLATSCR-NEXT: v_writelane_b32 v39, s51, 4 +; FLATSCR-NEXT: v_writelane_b32 v39, s52, 5 +; FLATSCR-NEXT: v_writelane_b32 v39, s53, 6 +; FLATSCR-NEXT: v_writelane_b32 v39, s54, 7 +; FLATSCR-NEXT: v_writelane_b32 v39, s55, 8 +; FLATSCR-NEXT: v_writelane_b32 v39, s64, 9 +; FLATSCR-NEXT: v_writelane_b32 v39, s65, 10 +; FLATSCR-NEXT: v_writelane_b32 v39, s66, 11 +; FLATSCR-NEXT: v_writelane_b32 v39, s67, 12 +; FLATSCR-NEXT: 
v_writelane_b32 v39, s68, 13 +; FLATSCR-NEXT: v_writelane_b32 v39, s69, 14 +; FLATSCR-NEXT: v_writelane_b32 v39, s70, 15 +; FLATSCR-NEXT: v_writelane_b32 v39, s71, 16 +; FLATSCR-NEXT: v_writelane_b32 v39, s80, 17 +; FLATSCR-NEXT: v_writelane_b32 v39, s81, 18 +; FLATSCR-NEXT: v_writelane_b32 v39, s82, 19 +; FLATSCR-NEXT: v_writelane_b32 v39, s83, 20 +; FLATSCR-NEXT: v_writelane_b32 v39, s84, 21 +; FLATSCR-NEXT: v_writelane_b32 v39, s85, 22 +; FLATSCR-NEXT: v_writelane_b32 v39, s86, 23 +; FLATSCR-NEXT: v_writelane_b32 v39, s87, 24 +; FLATSCR-NEXT: v_writelane_b32 v39, s96, 25 +; FLATSCR-NEXT: v_writelane_b32 v39, s97, 26 +; FLATSCR-NEXT: v_writelane_b32 v39, s98, 27 +; FLATSCR-NEXT: v_writelane_b32 v39, s99, 28 +; FLATSCR-NEXT: v_writelane_b32 v39, s100, 29 +; FLATSCR-NEXT: v_writelane_b32 v39, s101, 30 +; FLATSCR-NEXT: v_writelane_b32 v39, s102, 31 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: s_add_i32 s1, s33, 0x1000 ; FLATSCR-NEXT: scratch_store_dword off, v0, s1 @@ -2606,69 +1855,37 @@ define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5) ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber all VGPRs except CSR v40 ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s102, v39, 63 -; FLATSCR-NEXT: v_readlane_b32 s101, v39, 62 -; FLATSCR-NEXT: v_readlane_b32 s100, v39, 61 -; FLATSCR-NEXT: v_readlane_b32 s99, v39, 60 -; FLATSCR-NEXT: v_readlane_b32 s98, v39, 59 -; FLATSCR-NEXT: v_readlane_b32 s97, v39, 58 -; FLATSCR-NEXT: v_readlane_b32 s96, v39, 57 -; FLATSCR-NEXT: v_readlane_b32 s95, v39, 56 -; FLATSCR-NEXT: v_readlane_b32 s94, v39, 55 -; FLATSCR-NEXT: v_readlane_b32 s93, v39, 54 -; FLATSCR-NEXT: v_readlane_b32 s92, v39, 53 -; FLATSCR-NEXT: v_readlane_b32 s91, v39, 52 -; FLATSCR-NEXT: v_readlane_b32 s90, v39, 51 -; FLATSCR-NEXT: v_readlane_b32 s89, v39, 50 -; FLATSCR-NEXT: v_readlane_b32 s88, v39, 49 -; FLATSCR-NEXT: v_readlane_b32 s87, v39, 48 -; FLATSCR-NEXT: v_readlane_b32 s86, v39, 47 -; FLATSCR-NEXT: v_readlane_b32 
s85, v39, 46 -; FLATSCR-NEXT: v_readlane_b32 s84, v39, 45 -; FLATSCR-NEXT: v_readlane_b32 s83, v39, 44 -; FLATSCR-NEXT: v_readlane_b32 s82, v39, 43 -; FLATSCR-NEXT: v_readlane_b32 s81, v39, 42 -; FLATSCR-NEXT: v_readlane_b32 s80, v39, 41 -; FLATSCR-NEXT: v_readlane_b32 s79, v39, 40 -; FLATSCR-NEXT: v_readlane_b32 s78, v39, 39 -; FLATSCR-NEXT: v_readlane_b32 s77, v39, 38 -; FLATSCR-NEXT: v_readlane_b32 s76, v39, 37 -; FLATSCR-NEXT: v_readlane_b32 s75, v39, 36 -; FLATSCR-NEXT: v_readlane_b32 s74, v39, 35 -; FLATSCR-NEXT: v_readlane_b32 s73, v39, 34 -; FLATSCR-NEXT: v_readlane_b32 s72, v39, 33 -; FLATSCR-NEXT: v_readlane_b32 s71, v39, 32 -; FLATSCR-NEXT: v_readlane_b32 s70, v39, 31 -; FLATSCR-NEXT: v_readlane_b32 s69, v39, 30 -; FLATSCR-NEXT: v_readlane_b32 s68, v39, 29 -; FLATSCR-NEXT: v_readlane_b32 s67, v39, 28 -; FLATSCR-NEXT: v_readlane_b32 s66, v39, 27 -; FLATSCR-NEXT: v_readlane_b32 s65, v39, 26 -; FLATSCR-NEXT: v_readlane_b32 s64, v39, 25 -; FLATSCR-NEXT: v_readlane_b32 s63, v39, 24 -; FLATSCR-NEXT: v_readlane_b32 s62, v39, 23 -; FLATSCR-NEXT: v_readlane_b32 s61, v39, 22 -; FLATSCR-NEXT: v_readlane_b32 s60, v39, 21 -; FLATSCR-NEXT: v_readlane_b32 s59, v39, 20 -; FLATSCR-NEXT: v_readlane_b32 s58, v39, 19 -; FLATSCR-NEXT: v_readlane_b32 s57, v39, 18 -; FLATSCR-NEXT: v_readlane_b32 s56, v39, 17 -; FLATSCR-NEXT: v_readlane_b32 s55, v39, 16 -; FLATSCR-NEXT: v_readlane_b32 s54, v39, 15 -; FLATSCR-NEXT: v_readlane_b32 s53, v39, 14 -; FLATSCR-NEXT: v_readlane_b32 s52, v39, 13 -; FLATSCR-NEXT: v_readlane_b32 s51, v39, 12 -; FLATSCR-NEXT: v_readlane_b32 s50, v39, 11 -; FLATSCR-NEXT: v_readlane_b32 s49, v39, 10 -; FLATSCR-NEXT: v_readlane_b32 s48, v39, 9 -; FLATSCR-NEXT: v_readlane_b32 s47, v39, 8 -; FLATSCR-NEXT: v_readlane_b32 s46, v39, 7 -; FLATSCR-NEXT: v_readlane_b32 s45, v39, 6 -; FLATSCR-NEXT: v_readlane_b32 s44, v39, 5 -; FLATSCR-NEXT: v_readlane_b32 s43, v39, 4 -; FLATSCR-NEXT: v_readlane_b32 s42, v39, 3 -; FLATSCR-NEXT: v_readlane_b32 s41, v39, 2 -; 
FLATSCR-NEXT: v_readlane_b32 s40, v39, 1 +; FLATSCR-NEXT: v_readlane_b32 s102, v39, 31 +; FLATSCR-NEXT: v_readlane_b32 s101, v39, 30 +; FLATSCR-NEXT: v_readlane_b32 s100, v39, 29 +; FLATSCR-NEXT: v_readlane_b32 s99, v39, 28 +; FLATSCR-NEXT: v_readlane_b32 s98, v39, 27 +; FLATSCR-NEXT: v_readlane_b32 s97, v39, 26 +; FLATSCR-NEXT: v_readlane_b32 s96, v39, 25 +; FLATSCR-NEXT: v_readlane_b32 s87, v39, 24 +; FLATSCR-NEXT: v_readlane_b32 s86, v39, 23 +; FLATSCR-NEXT: v_readlane_b32 s85, v39, 22 +; FLATSCR-NEXT: v_readlane_b32 s84, v39, 21 +; FLATSCR-NEXT: v_readlane_b32 s83, v39, 20 +; FLATSCR-NEXT: v_readlane_b32 s82, v39, 19 +; FLATSCR-NEXT: v_readlane_b32 s81, v39, 18 +; FLATSCR-NEXT: v_readlane_b32 s80, v39, 17 +; FLATSCR-NEXT: v_readlane_b32 s71, v39, 16 +; FLATSCR-NEXT: v_readlane_b32 s70, v39, 15 +; FLATSCR-NEXT: v_readlane_b32 s69, v39, 14 +; FLATSCR-NEXT: v_readlane_b32 s68, v39, 13 +; FLATSCR-NEXT: v_readlane_b32 s67, v39, 12 +; FLATSCR-NEXT: v_readlane_b32 s66, v39, 11 +; FLATSCR-NEXT: v_readlane_b32 s65, v39, 10 +; FLATSCR-NEXT: v_readlane_b32 s64, v39, 9 +; FLATSCR-NEXT: v_readlane_b32 s55, v39, 8 +; FLATSCR-NEXT: v_readlane_b32 s54, v39, 7 +; FLATSCR-NEXT: v_readlane_b32 s53, v39, 6 +; FLATSCR-NEXT: v_readlane_b32 s52, v39, 5 +; FLATSCR-NEXT: v_readlane_b32 s51, v39, 4 +; FLATSCR-NEXT: v_readlane_b32 s50, v39, 3 +; FLATSCR-NEXT: v_readlane_b32 s49, v39, 2 +; FLATSCR-NEXT: v_readlane_b32 s48, v39, 1 ; FLATSCR-NEXT: v_readlane_b32 s39, v39, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll new file mode 100644 index 0000000000000..9558d9f0bc4c9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a | FileCheck %s + +define amdgpu_kernel void 
@copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: copy_to_reg_frameindex: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: .LBB0_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_cmp_lt_u32 0, 16 +; CHECK-NEXT: s_set_gpr_idx_on 0, gpr_idx(DST) +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_set_gpr_idx_off +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %done +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm +entry: + %B = srem i32 %c, -1 + br label %loop + +loop: + %promotealloca = phi <16 x i32> [ undef, %entry ], [ %0, %loop ] + %inc = phi i32 [ 0, %entry ], [ %inc.i, %loop ] + %0 = insertelement <16 x i32> %promotealloca, i32 %inc, i32 %inc + %inc.i = add i32 %inc, %B + %cnd = icmp uge i32 %inc.i, 16 + br i1 %cnd, label %done, label %loop + +done: + %1 = extractelement <16 x i32> %0, i32 0 + store i32 %1, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index fdc9704a3784e..a01c2fa152ab3 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -337,8 +337,7 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, 0x800000 ; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc ; GFX9-NEXT: v_ldexp_f32 v3, |v0|, v3 ; GFX9-NEXT: v_log_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir 
b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir index ff713000f6985..209ac8e811456 100644 --- a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir +++ b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir @@ -14,23 +14,13 @@ body: | ; CHECK-LABEL: name: def_csr_sgpr ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr42, $sgpr43, $sgpr46, $sgpr47 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr0 - ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr0, 0 - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr42, 0, $vgpr0 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr42, $vgpr0, 0, 32 - ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr43, 1, $vgpr0 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr43, $vgpr0, 1, 32 - ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr46, 2, $vgpr0 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr46, $vgpr0, 2, 32 - ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr47, 3, $vgpr0 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr47, $vgpr0, 3, 32 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: diff --git 
a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index 935ae48654b64..993f162921663 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -1278,84 +1278,44 @@ define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2) } define double @fmul_select_f64_test12(double %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_f64_test12: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v2, v3 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 31, v2 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_f64_test12: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_f64_test12: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v2, v3 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v3, 31, v2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: fmul_select_f64_test12: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v5, 1 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX9-GISEL-NEXT: 
v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: fmul_select_f64_test12: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v3 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v3, 31, v3 -; GFX10-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_f64_test12: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: fmul_select_f64_test12: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_f64_test12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: fmul_select_f64_test12: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 31, v3 -; 
GFX11-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f64_test12: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: fmul_select_f64_test12: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: fmul_select_f64_test12: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double 0.000000e+00, double -0.000000e+00 %ldexp = fmul double %x, %y @@ -3137,11 +3097,11 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_bf16_test8: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 31, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3149,10 +3109,10 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff8000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 15 -; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3169,8 +3129,7 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_lshlrev_b16 v1, 15, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffff8000, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3186,19 +3145,17 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffff8000, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b16 v1, 15, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 diff --git a/llvm/test/CodeGen/AMDGPU/dbg-value-starts-sched-region.mir b/llvm/test/CodeGen/AMDGPU/dbg-value-starts-sched-region.mir new file mode 100644 index 0000000000000..0785fe31d63b4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dbg-value-starts-sched-region.mir @@ -0,0 +1,27 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-misched -run-pass=machine-scheduler -o - %s | FileCheck %s + +# Verify we maintain live-ins even if the first instruction in sched region is +# DBG_. + +--- +name: sched +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched + ; CHECK: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: SCHED_BARRIER 0 + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:sgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_ENDPGM 0 + %0:sgpr_32 = IMPLICIT_DEF + S_NOP 0 + SCHED_BARRIER 0 + DBG_VALUE + dead %1:sgpr_32 = COPY %0 + S_NOP 0 + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll index 7397433d2a125..8a315c337d1ee 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll +++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll @@ -1,68 +1,2190 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm -o - -emit-heterogeneous-dwarf-as-user-ops %s | FileCheck --check-prefixes=CHECK,WAVE64,GFX900 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-spill-vgpr-to-agpr=0 -filetype=asm -o - -emit-heterogeneous-dwarf-as-user-ops %s | FileCheck --check-prefixes=CHECK,WAVE64,GFX90A-V2A-DIS %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-spill-vgpr-to-agpr=1 -filetype=asm -o - -emit-heterogeneous-dwarf-as-user-ops %s | FileCheck --check-prefixes=CHECK,WAVE64,GFX90A-V2A-EN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm -o - -emit-heterogeneous-dwarf-as-user-ops %s | FileCheck --check-prefixes=CHECK,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-spill-vgpr-to-agpr=0 -filetype=asm -o - -emit-heterogeneous-dwarf-as-user-ops %s | FileCheck --check-prefixes=CHECK,GFX90A-V2A-DIS %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-spill-vgpr-to-agpr=1 -filetype=asm -o - -emit-heterogeneous-dwarf-as-user-ops %s | FileCheck --check-prefixes=CHECK,GFX90A-V2A-EN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -filetype=asm -o - -emit-heterogeneous-dwarf-as-user-ops %s | FileCheck --check-prefixes=CHECK,WAVE32 %s -; CHECK-LABEL: kern1: -; CHECK: .cfi_startproc - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: %bb.0: -; DW_CFA_def_cfa_expression [0x0f] -; BLOCK_LENGTH ULEB128(3)=[0x04] -; DW_OP_lit0 [0x30] -; DW_OP_lit6 [0x36] -; DW_OP_LLVM_user [0xe9] -; DW_OP_LLVM_form_aspace_address [0x02] -; CHECK-NEXT: .cfi_escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 -; 
PC_64 = 16 -; CHECK-NEXT: .cfi_undefined 16 - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: .cfi_endproc define protected amdgpu_kernel void @kern1() #0 { +; CHECK-LABEL: kern1: +; CHECK: .Lfunc_begin0: +; CHECK-NEXT: .cfi_sections .debug_frame +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: .cfi_escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 ; +; CHECK-NEXT: .cfi_undefined 16 +; CHECK-NEXT: s_endpgm entry: ret void } -; CHECK-LABEL: func_no_clobber: -; CHECK: .cfi_startproc - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: %bb.0: -; SGPR32 = 64 -; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 -; CHECK-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: .cfi_endproc define hidden void @func_no_clobber() #0 { +; CHECK-LABEL: func_no_clobber: +; CHECK: .Lfunc_begin1: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; CHECK-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] entry: ret void } -; CHECK-LABEL: {{^}}callee_need_to_spill_fp_to_memory: -; CHECK: .cfi_startproc - -; SGPR33 = 65 -; CHECK: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; CHECK: s_mov_b32 s33, s32 -; CHECK: v_mov_b32_e32 [[TMP_VGPR:v[0-9]+]], [[FP_SCRATCH_COPY]] -; GFX900: buffer_store_dword [[TMP_VGPR]], off, s[0:3], s33 offset:448 ; 4-byte Folded Spill -; GFX90A-V2A-DIS: buffer_store_dword [[TMP_VGPR]], off, s[0:3], s33 offset:448 ; 4-byte Folded Spill -; GFX90A-V2A-EN: buffer_store_dword [[TMP_VGPR]], off, s[0:3], s33 offset:320 ; 4-byte Folded Spill - -; GFX900: .cfi_offset 65, 28672 -; GFX90A-V2A-DIS: .cfi_offset 65, 28672 -; GFX90A-V2A-EN: .cfi_offset 65, 20480 -; WAVE32: .cfi_offset 65, 14336 - -; CHECK: .cfi_endproc define void @callee_need_to_spill_fp_to_memory() #1 { +; GFX900-LABEL: callee_need_to_spill_fp_to_memory: +; GFX900: .Lfunc_begin2: +; GFX900-NEXT: .cfi_startproc +; GFX900-NEXT: ; 
%bb.0: +; GFX900-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GFX900-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; GFX900-NEXT: .cfi_undefined 2560 +; GFX900-NEXT: .cfi_undefined 2561 +; GFX900-NEXT: .cfi_undefined 2562 +; GFX900-NEXT: .cfi_undefined 2563 +; GFX900-NEXT: .cfi_undefined 2564 +; GFX900-NEXT: .cfi_undefined 2565 +; GFX900-NEXT: .cfi_undefined 2566 +; GFX900-NEXT: .cfi_undefined 2567 +; GFX900-NEXT: .cfi_undefined 2568 +; GFX900-NEXT: .cfi_undefined 2569 +; GFX900-NEXT: .cfi_undefined 2570 +; GFX900-NEXT: .cfi_undefined 2571 +; GFX900-NEXT: .cfi_undefined 2572 +; GFX900-NEXT: .cfi_undefined 2573 +; GFX900-NEXT: .cfi_undefined 2574 +; GFX900-NEXT: .cfi_undefined 2575 +; GFX900-NEXT: .cfi_undefined 2576 +; GFX900-NEXT: .cfi_undefined 2577 +; GFX900-NEXT: .cfi_undefined 2578 +; GFX900-NEXT: .cfi_undefined 2579 +; GFX900-NEXT: .cfi_undefined 2580 +; GFX900-NEXT: .cfi_undefined 2581 +; GFX900-NEXT: .cfi_undefined 2582 +; GFX900-NEXT: .cfi_undefined 2583 +; GFX900-NEXT: .cfi_undefined 2584 +; GFX900-NEXT: .cfi_undefined 2585 +; GFX900-NEXT: .cfi_undefined 2586 +; GFX900-NEXT: .cfi_undefined 2587 +; GFX900-NEXT: .cfi_undefined 2588 +; GFX900-NEXT: .cfi_undefined 2589 +; GFX900-NEXT: .cfi_undefined 2590 +; GFX900-NEXT: .cfi_undefined 2591 +; GFX900-NEXT: .cfi_undefined 2592 +; GFX900-NEXT: .cfi_undefined 2593 +; GFX900-NEXT: .cfi_undefined 2594 +; GFX900-NEXT: .cfi_undefined 2595 +; GFX900-NEXT: .cfi_undefined 2596 +; GFX900-NEXT: .cfi_undefined 2597 +; GFX900-NEXT: .cfi_undefined 2598 +; GFX900-NEXT: .cfi_undefined 2599 +; GFX900-NEXT: .cfi_undefined 2608 +; GFX900-NEXT: .cfi_undefined 2609 +; GFX900-NEXT: .cfi_undefined 2610 +; GFX900-NEXT: .cfi_undefined 2611 +; GFX900-NEXT: .cfi_undefined 2612 +; GFX900-NEXT: .cfi_undefined 2613 +; GFX900-NEXT: .cfi_undefined 2614 +; GFX900-NEXT: .cfi_undefined 2615 +; GFX900-NEXT: .cfi_undefined 2624 +; GFX900-NEXT: .cfi_undefined 2625 +; GFX900-NEXT: .cfi_undefined 2626 +; GFX900-NEXT: .cfi_undefined 2627 +; 
GFX900-NEXT: .cfi_undefined 2628 +; GFX900-NEXT: .cfi_undefined 2629 +; GFX900-NEXT: .cfi_undefined 2630 +; GFX900-NEXT: .cfi_undefined 2631 +; GFX900-NEXT: .cfi_undefined 2640 +; GFX900-NEXT: .cfi_undefined 2641 +; GFX900-NEXT: .cfi_undefined 2642 +; GFX900-NEXT: .cfi_undefined 2643 +; GFX900-NEXT: .cfi_undefined 2644 +; GFX900-NEXT: .cfi_undefined 2645 +; GFX900-NEXT: .cfi_undefined 2646 +; GFX900-NEXT: .cfi_undefined 2647 +; GFX900-NEXT: .cfi_undefined 2656 +; GFX900-NEXT: .cfi_undefined 2657 +; GFX900-NEXT: .cfi_undefined 2658 +; GFX900-NEXT: .cfi_undefined 2659 +; GFX900-NEXT: .cfi_undefined 2660 +; GFX900-NEXT: .cfi_undefined 2661 +; GFX900-NEXT: .cfi_undefined 2662 +; GFX900-NEXT: .cfi_undefined 2663 +; GFX900-NEXT: .cfi_undefined 2672 +; GFX900-NEXT: .cfi_undefined 2673 +; GFX900-NEXT: .cfi_undefined 2674 +; GFX900-NEXT: .cfi_undefined 2675 +; GFX900-NEXT: .cfi_undefined 2676 +; GFX900-NEXT: .cfi_undefined 2677 +; GFX900-NEXT: .cfi_undefined 2678 +; GFX900-NEXT: .cfi_undefined 2679 +; GFX900-NEXT: .cfi_undefined 2688 +; GFX900-NEXT: .cfi_undefined 2689 +; GFX900-NEXT: .cfi_undefined 2690 +; GFX900-NEXT: .cfi_undefined 2691 +; GFX900-NEXT: .cfi_undefined 2692 +; GFX900-NEXT: .cfi_undefined 2693 +; GFX900-NEXT: .cfi_undefined 2694 +; GFX900-NEXT: .cfi_undefined 2695 +; GFX900-NEXT: .cfi_undefined 2704 +; GFX900-NEXT: .cfi_undefined 2705 +; GFX900-NEXT: .cfi_undefined 2706 +; GFX900-NEXT: .cfi_undefined 2707 +; GFX900-NEXT: .cfi_undefined 2708 +; GFX900-NEXT: .cfi_undefined 2709 +; GFX900-NEXT: .cfi_undefined 2710 +; GFX900-NEXT: .cfi_undefined 2711 +; GFX900-NEXT: .cfi_undefined 2720 +; GFX900-NEXT: .cfi_undefined 2721 +; GFX900-NEXT: .cfi_undefined 2722 +; GFX900-NEXT: .cfi_undefined 2723 +; GFX900-NEXT: .cfi_undefined 2724 +; GFX900-NEXT: .cfi_undefined 2725 +; GFX900-NEXT: .cfi_undefined 2726 +; GFX900-NEXT: .cfi_undefined 2727 +; GFX900-NEXT: .cfi_undefined 2736 +; GFX900-NEXT: .cfi_undefined 2737 +; GFX900-NEXT: .cfi_undefined 2738 +; GFX900-NEXT: 
.cfi_undefined 2739 +; GFX900-NEXT: .cfi_undefined 2740 +; GFX900-NEXT: .cfi_undefined 2741 +; GFX900-NEXT: .cfi_undefined 2742 +; GFX900-NEXT: .cfi_undefined 2743 +; GFX900-NEXT: .cfi_undefined 2752 +; GFX900-NEXT: .cfi_undefined 2753 +; GFX900-NEXT: .cfi_undefined 2754 +; GFX900-NEXT: .cfi_undefined 2755 +; GFX900-NEXT: .cfi_undefined 2756 +; GFX900-NEXT: .cfi_undefined 2757 +; GFX900-NEXT: .cfi_undefined 2758 +; GFX900-NEXT: .cfi_undefined 2759 +; GFX900-NEXT: .cfi_undefined 2768 +; GFX900-NEXT: .cfi_undefined 2769 +; GFX900-NEXT: .cfi_undefined 2770 +; GFX900-NEXT: .cfi_undefined 2771 +; GFX900-NEXT: .cfi_undefined 2772 +; GFX900-NEXT: .cfi_undefined 2773 +; GFX900-NEXT: .cfi_undefined 2774 +; GFX900-NEXT: .cfi_undefined 2775 +; GFX900-NEXT: .cfi_undefined 2784 +; GFX900-NEXT: .cfi_undefined 2785 +; GFX900-NEXT: .cfi_undefined 2786 +; GFX900-NEXT: .cfi_undefined 2787 +; GFX900-NEXT: .cfi_undefined 2788 +; GFX900-NEXT: .cfi_undefined 2789 +; GFX900-NEXT: .cfi_undefined 2790 +; GFX900-NEXT: .cfi_undefined 2791 +; GFX900-NEXT: .cfi_undefined 2800 +; GFX900-NEXT: .cfi_undefined 2801 +; GFX900-NEXT: .cfi_undefined 2802 +; GFX900-NEXT: .cfi_undefined 2803 +; GFX900-NEXT: .cfi_undefined 2804 +; GFX900-NEXT: .cfi_undefined 2805 +; GFX900-NEXT: .cfi_undefined 2806 +; GFX900-NEXT: .cfi_undefined 2807 +; GFX900-NEXT: .cfi_undefined 36 +; GFX900-NEXT: .cfi_undefined 37 +; GFX900-NEXT: .cfi_undefined 38 +; GFX900-NEXT: .cfi_undefined 39 +; GFX900-NEXT: .cfi_undefined 40 +; GFX900-NEXT: .cfi_undefined 41 +; GFX900-NEXT: .cfi_undefined 42 +; GFX900-NEXT: .cfi_undefined 43 +; GFX900-NEXT: .cfi_undefined 44 +; GFX900-NEXT: .cfi_undefined 45 +; GFX900-NEXT: .cfi_undefined 46 +; GFX900-NEXT: .cfi_undefined 47 +; GFX900-NEXT: .cfi_undefined 48 +; GFX900-NEXT: .cfi_undefined 49 +; GFX900-NEXT: .cfi_undefined 50 +; GFX900-NEXT: .cfi_undefined 51 +; GFX900-NEXT: .cfi_undefined 52 +; GFX900-NEXT: .cfi_undefined 53 +; GFX900-NEXT: .cfi_undefined 54 +; GFX900-NEXT: .cfi_undefined 55 +; 
GFX900-NEXT: .cfi_undefined 56 +; GFX900-NEXT: .cfi_undefined 57 +; GFX900-NEXT: .cfi_undefined 58 +; GFX900-NEXT: .cfi_undefined 59 +; GFX900-NEXT: .cfi_undefined 60 +; GFX900-NEXT: .cfi_undefined 61 +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s40, s33 +; GFX900-NEXT: .cfi_register 65, 72 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: .cfi_def_cfa_register 65 +; GFX900-NEXT: s_addk_i32 s32, 0x7100 +; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 28416 +; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 28160 +; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2602, 32, 17, 64, 27904 +; GFX900-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2603, 32, 17, 64, 27648 +; GFX900-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2604, 32, 17, 64, 27392 +; GFX900-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2605, 32, 17, 64, 27136 +; GFX900-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2606, 32, 17, 64, 26880 +; GFX900-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2607, 32, 17, 64, 26624 +; GFX900-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2616, 32, 17, 64, 26368 +; GFX900-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2617, 32, 17, 64, 26112 +; GFX900-NEXT: 
buffer_store_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2618, 32, 17, 64, 25856 +; GFX900-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2619, 32, 17, 64, 25600 +; GFX900-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2620, 32, 17, 64, 25344 +; GFX900-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2621, 32, 17, 64, 25088 +; GFX900-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2622, 32, 17, 64, 24832 +; GFX900-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2623, 32, 17, 64, 24576 +; GFX900-NEXT: buffer_store_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2632, 32, 17, 64, 24320 +; GFX900-NEXT: buffer_store_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2633, 32, 17, 64, 24064 +; GFX900-NEXT: buffer_store_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2634, 32, 17, 64, 23808 +; GFX900-NEXT: buffer_store_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2635, 32, 17, 64, 23552 +; GFX900-NEXT: buffer_store_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2636, 32, 17, 64, 23296 +; GFX900-NEXT: buffer_store_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2637, 32, 17, 64, 23040 +; GFX900-NEXT: buffer_store_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2638, 32, 17, 64, 22784 +; 
GFX900-NEXT: buffer_store_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2639, 32, 17, 64, 22528 +; GFX900-NEXT: buffer_store_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2648, 32, 17, 64, 22272 +; GFX900-NEXT: buffer_store_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2649, 32, 17, 64, 22016 +; GFX900-NEXT: buffer_store_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2650, 32, 17, 64, 21760 +; GFX900-NEXT: buffer_store_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2651, 32, 17, 64, 21504 +; GFX900-NEXT: buffer_store_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2652, 32, 17, 64, 21248 +; GFX900-NEXT: buffer_store_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2653, 32, 17, 64, 20992 +; GFX900-NEXT: buffer_store_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2654, 32, 17, 64, 20736 +; GFX900-NEXT: buffer_store_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2655, 32, 17, 64, 20480 +; GFX900-NEXT: buffer_store_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2664, 32, 17, 64, 20224 +; GFX900-NEXT: buffer_store_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2665, 32, 17, 64, 19968 +; GFX900-NEXT: buffer_store_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2666, 32, 17, 64, 19712 +; GFX900-NEXT: buffer_store_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2667, 32, 17, 64, 
19456 +; GFX900-NEXT: buffer_store_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2668, 32, 17, 64, 19200 +; GFX900-NEXT: buffer_store_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2669, 32, 17, 64, 18944 +; GFX900-NEXT: buffer_store_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2670, 32, 17, 64, 18688 +; GFX900-NEXT: buffer_store_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2671, 32, 17, 64, 18432 +; GFX900-NEXT: buffer_store_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2680, 32, 17, 64, 18176 +; GFX900-NEXT: buffer_store_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2681, 32, 17, 64, 17920 +; GFX900-NEXT: buffer_store_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2682, 32, 17, 64, 17664 +; GFX900-NEXT: buffer_store_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2683, 32, 17, 64, 17408 +; GFX900-NEXT: buffer_store_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2684, 32, 17, 64, 17152 +; GFX900-NEXT: buffer_store_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2685, 32, 17, 64, 16896 +; GFX900-NEXT: buffer_store_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2686, 32, 17, 64, 16640 +; GFX900-NEXT: buffer_store_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2687, 32, 17, 64, 16384 +; GFX900-NEXT: buffer_store_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 
2696, 32, 17, 64, 16128 +; GFX900-NEXT: buffer_store_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2697, 32, 17, 64, 15872 +; GFX900-NEXT: buffer_store_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2698, 32, 17, 64, 15616 +; GFX900-NEXT: buffer_store_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2699, 32, 17, 64, 15360 +; GFX900-NEXT: buffer_store_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2700, 32, 17, 64, 15104 +; GFX900-NEXT: buffer_store_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2701, 32, 17, 64, 14848 +; GFX900-NEXT: buffer_store_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2702, 32, 17, 64, 14592 +; GFX900-NEXT: buffer_store_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2703, 32, 17, 64, 14336 +; GFX900-NEXT: buffer_store_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2712, 32, 17, 64, 14080 +; GFX900-NEXT: buffer_store_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2713, 32, 17, 64, 13824 +; GFX900-NEXT: buffer_store_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2714, 32, 17, 64, 13568 +; GFX900-NEXT: buffer_store_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2715, 32, 17, 64, 13312 +; GFX900-NEXT: buffer_store_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2716, 32, 17, 64, 13056 +; GFX900-NEXT: buffer_store_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Spill +; GFX900-NEXT: 
.cfi_llvm_vector_offset 2717, 32, 17, 64, 12800 +; GFX900-NEXT: buffer_store_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2718, 32, 17, 64, 12544 +; GFX900-NEXT: buffer_store_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2719, 32, 17, 64, 12288 +; GFX900-NEXT: buffer_store_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2728, 32, 17, 64, 12032 +; GFX900-NEXT: buffer_store_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2729, 32, 17, 64, 11776 +; GFX900-NEXT: buffer_store_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2730, 32, 17, 64, 11520 +; GFX900-NEXT: buffer_store_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2731, 32, 17, 64, 11264 +; GFX900-NEXT: buffer_store_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2732, 32, 17, 64, 11008 +; GFX900-NEXT: buffer_store_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2733, 32, 17, 64, 10752 +; GFX900-NEXT: buffer_store_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2734, 32, 17, 64, 10496 +; GFX900-NEXT: buffer_store_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2735, 32, 17, 64, 10240 +; GFX900-NEXT: buffer_store_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2744, 32, 17, 64, 9984 +; GFX900-NEXT: buffer_store_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2745, 32, 17, 64, 9728 +; GFX900-NEXT: buffer_store_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill +; 
GFX900-NEXT: .cfi_llvm_vector_offset 2746, 32, 17, 64, 9472 +; GFX900-NEXT: buffer_store_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2747, 32, 17, 64, 9216 +; GFX900-NEXT: buffer_store_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2748, 32, 17, 64, 8960 +; GFX900-NEXT: buffer_store_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2749, 32, 17, 64, 8704 +; GFX900-NEXT: buffer_store_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2750, 32, 17, 64, 8448 +; GFX900-NEXT: buffer_store_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2751, 32, 17, 64, 8192 +; GFX900-NEXT: buffer_store_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2760, 32, 17, 64, 7936 +; GFX900-NEXT: buffer_store_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2761, 32, 17, 64, 7680 +; GFX900-NEXT: buffer_store_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2762, 32, 17, 64, 7424 +; GFX900-NEXT: buffer_store_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2763, 32, 17, 64, 7168 +; GFX900-NEXT: buffer_store_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2764, 32, 17, 64, 6912 +; GFX900-NEXT: buffer_store_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2765, 32, 17, 64, 6656 +; GFX900-NEXT: buffer_store_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2766, 32, 17, 64, 6400 +; GFX900-NEXT: buffer_store_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill 
+; GFX900-NEXT: .cfi_llvm_vector_offset 2767, 32, 17, 64, 6144 +; GFX900-NEXT: buffer_store_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2776, 32, 17, 64, 5888 +; GFX900-NEXT: buffer_store_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2777, 32, 17, 64, 5632 +; GFX900-NEXT: buffer_store_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2778, 32, 17, 64, 5376 +; GFX900-NEXT: buffer_store_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2779, 32, 17, 64, 5120 +; GFX900-NEXT: buffer_store_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2780, 32, 17, 64, 4864 +; GFX900-NEXT: buffer_store_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2781, 32, 17, 64, 4608 +; GFX900-NEXT: buffer_store_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2782, 32, 17, 64, 4352 +; GFX900-NEXT: buffer_store_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2783, 32, 17, 64, 4096 +; GFX900-NEXT: buffer_store_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2792, 32, 17, 64, 3840 +; GFX900-NEXT: buffer_store_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2793, 32, 17, 64, 3584 +; GFX900-NEXT: buffer_store_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2794, 32, 17, 64, 3328 +; GFX900-NEXT: buffer_store_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2795, 32, 17, 64, 3072 +; GFX900-NEXT: buffer_store_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; 
GFX900-NEXT: .cfi_llvm_vector_offset 2796, 32, 17, 64, 2816 +; GFX900-NEXT: buffer_store_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2797, 32, 17, 64, 2560 +; GFX900-NEXT: buffer_store_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2798, 32, 17, 64, 2304 +; GFX900-NEXT: buffer_store_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2799, 32, 17, 64, 2048 +; GFX900-NEXT: buffer_store_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2808, 32, 17, 64, 1792 +; GFX900-NEXT: buffer_store_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2809, 32, 17, 64, 1536 +; GFX900-NEXT: buffer_store_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2810, 32, 17, 64, 1280 +; GFX900-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2811, 32, 17, 64, 1024 +; GFX900-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2812, 32, 17, 64, 768 +; GFX900-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2813, 32, 17, 64, 512 +; GFX900-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2814, 32, 17, 64, 256 +; GFX900-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2815, 32, 17, 64, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; clobber nonpreserved SGPRs +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; clobber all VGPRs +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: buffer_load_dword v255, off, s[0:3], s33 ; 4-byte Folded Reload +; 
GFX900-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX900-NEXT: 
buffer_load_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Reload +; GFX900-NEXT: 
buffer_load_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Reload +; 
GFX900-NEXT: buffer_load_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Reload +; 
GFX900-NEXT: buffer_load_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload +; GFX900-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: .cfi_def_cfa_register 64 +; GFX900-NEXT: s_mov_b32 s33, s40 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-V2A-DIS-LABEL: callee_need_to_spill_fp_to_memory: +; GFX90A-V2A-DIS: .Lfunc_begin2: +; GFX90A-V2A-DIS-NEXT: .cfi_startproc +; GFX90A-V2A-DIS-NEXT: ; %bb.0: +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2560 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2561 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2562 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2563 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2564 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2565 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2566 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2567 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2568 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2569 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2570 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2571 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2572 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2573 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2574 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2575 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2576 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2577 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2578 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2579 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2580 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2581 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2582 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2583 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2584 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2585 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2586 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2587 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2588 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2589 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2590 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2591 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 
2592 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2593 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2594 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2595 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2596 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2597 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2598 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2599 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2608 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2609 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2610 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2611 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2612 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2613 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2614 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2615 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2624 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2625 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2626 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2627 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2628 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2629 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2630 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2631 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2640 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2641 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2642 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2643 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2644 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2645 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2646 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2647 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2656 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2657 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2658 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2659 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2660 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2661 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2662 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2663 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2672 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2673 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2674 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2675 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2676 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2677 +; 
GFX90A-V2A-DIS-NEXT: .cfi_undefined 2678 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2679 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2688 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2689 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2690 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2691 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2692 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2693 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2694 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2695 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2704 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2705 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2706 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2707 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2708 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2709 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2710 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2711 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2720 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2721 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2722 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2723 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2724 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2725 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2726 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2727 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2736 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2737 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2738 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2739 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2740 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2741 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2742 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2743 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2752 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2753 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2754 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2755 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2756 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2757 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2758 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2759 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2768 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2769 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2770 +; 
GFX90A-V2A-DIS-NEXT: .cfi_undefined 2771 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2772 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2773 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2774 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2775 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2784 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2785 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2786 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2787 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2788 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2789 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2790 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2791 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2800 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2801 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2802 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2803 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2804 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2805 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2806 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2807 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 36 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 37 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 38 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 39 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 40 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 41 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 42 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 43 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 44 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 45 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 46 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 47 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 48 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 49 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 50 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 51 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 52 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 53 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 54 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 55 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 56 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 57 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 58 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 59 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 60 +; GFX90A-V2A-DIS-NEXT: 
.cfi_undefined 61 +; GFX90A-V2A-DIS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-V2A-DIS-NEXT: s_mov_b32 s40, s33 +; GFX90A-V2A-DIS-NEXT: .cfi_register 65, 72 +; GFX90A-V2A-DIS-NEXT: s_mov_b32 s33, s32 +; GFX90A-V2A-DIS-NEXT: .cfi_def_cfa_register 65 +; GFX90A-V2A-DIS-NEXT: s_addk_i32 s32, 0x7100 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 28416 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 28160 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2602, 32, 17, 64, 27904 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2603, 32, 17, 64, 27648 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2604, 32, 17, 64, 27392 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2605, 32, 17, 64, 27136 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2606, 32, 17, 64, 26880 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2607, 32, 17, 64, 26624 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2616, 32, 17, 64, 26368 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2617, 32, 17, 
64, 26112 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2618, 32, 17, 64, 25856 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2619, 32, 17, 64, 25600 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2620, 32, 17, 64, 25344 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2621, 32, 17, 64, 25088 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2622, 32, 17, 64, 24832 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2623, 32, 17, 64, 24576 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2632, 32, 17, 64, 24320 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2633, 32, 17, 64, 24064 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2634, 32, 17, 64, 23808 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2635, 32, 17, 64, 23552 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2636, 32, 17, 64, 23296 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Spill +; 
GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2637, 32, 17, 64, 23040 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2638, 32, 17, 64, 22784 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2639, 32, 17, 64, 22528 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2648, 32, 17, 64, 22272 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2649, 32, 17, 64, 22016 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2650, 32, 17, 64, 21760 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2651, 32, 17, 64, 21504 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2652, 32, 17, 64, 21248 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2653, 32, 17, 64, 20992 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2654, 32, 17, 64, 20736 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2655, 32, 17, 64, 20480 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2664, 32, 17, 64, 20224 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v105, off, 
s[0:3], s33 offset:312 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2665, 32, 17, 64, 19968 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2666, 32, 17, 64, 19712 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2667, 32, 17, 64, 19456 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2668, 32, 17, 64, 19200 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2669, 32, 17, 64, 18944 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2670, 32, 17, 64, 18688 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2671, 32, 17, 64, 18432 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2680, 32, 17, 64, 18176 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2681, 32, 17, 64, 17920 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2682, 32, 17, 64, 17664 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2683, 32, 17, 64, 17408 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2684, 32, 17, 64, 17152 
+; GFX90A-V2A-DIS-NEXT: buffer_store_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2685, 32, 17, 64, 16896 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2686, 32, 17, 64, 16640 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2687, 32, 17, 64, 16384 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2696, 32, 17, 64, 16128 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2697, 32, 17, 64, 15872 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2698, 32, 17, 64, 15616 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2699, 32, 17, 64, 15360 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2700, 32, 17, 64, 15104 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2701, 32, 17, 64, 14848 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2702, 32, 17, 64, 14592 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2703, 32, 17, 64, 14336 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Spill +; 
GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2712, 32, 17, 64, 14080 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2713, 32, 17, 64, 13824 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2714, 32, 17, 64, 13568 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2715, 32, 17, 64, 13312 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2716, 32, 17, 64, 13056 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2717, 32, 17, 64, 12800 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2718, 32, 17, 64, 12544 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2719, 32, 17, 64, 12288 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2728, 32, 17, 64, 12032 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2729, 32, 17, 64, 11776 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2730, 32, 17, 64, 11520 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2731, 32, 17, 64, 11264 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v172, 
off, s[0:3], s33 offset:172 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2732, 32, 17, 64, 11008 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2733, 32, 17, 64, 10752 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2734, 32, 17, 64, 10496 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2735, 32, 17, 64, 10240 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2744, 32, 17, 64, 9984 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2745, 32, 17, 64, 9728 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2746, 32, 17, 64, 9472 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2747, 32, 17, 64, 9216 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2748, 32, 17, 64, 8960 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2749, 32, 17, 64, 8704 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2750, 32, 17, 64, 8448 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2751, 32, 17, 64, 8192 +; 
GFX90A-V2A-DIS-NEXT: buffer_store_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2760, 32, 17, 64, 7936 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2761, 32, 17, 64, 7680 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2762, 32, 17, 64, 7424 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2763, 32, 17, 64, 7168 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2764, 32, 17, 64, 6912 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2765, 32, 17, 64, 6656 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2766, 32, 17, 64, 6400 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2767, 32, 17, 64, 6144 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2776, 32, 17, 64, 5888 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2777, 32, 17, 64, 5632 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2778, 32, 17, 64, 5376 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: 
.cfi_llvm_vector_offset 2779, 32, 17, 64, 5120 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2780, 32, 17, 64, 4864 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2781, 32, 17, 64, 4608 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2782, 32, 17, 64, 4352 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2783, 32, 17, 64, 4096 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2792, 32, 17, 64, 3840 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2793, 32, 17, 64, 3584 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2794, 32, 17, 64, 3328 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2795, 32, 17, 64, 3072 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2796, 32, 17, 64, 2816 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2797, 32, 17, 64, 2560 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2798, 32, 17, 64, 2304 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded 
Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2799, 32, 17, 64, 2048 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2808, 32, 17, 64, 1792 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2809, 32, 17, 64, 1536 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2810, 32, 17, 64, 1280 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2811, 32, 17, 64, 1024 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2812, 32, 17, 64, 768 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2813, 32, 17, 64, 512 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2814, 32, 17, 64, 256 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2815, 32, 17, 64, 0 +; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART +; GFX90A-V2A-DIS-NEXT: ; clobber nonpreserved SGPRs +; GFX90A-V2A-DIS-NEXT: ;;#ASMEND +; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART +; GFX90A-V2A-DIS-NEXT: ; clobber all VGPRs +; GFX90A-V2A-DIS-NEXT: ;;#ASMEND +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v255, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v252, off, s[0:3], s33 
offset:12 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; 
GFX90A-V2A-DIS-NEXT: buffer_load_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: 
buffer_load_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword 
v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v91, off, s[0:3], s33 
offset:336 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Reload +; 
GFX90A-V2A-DIS-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: s_mov_b32 s32, s33 +; GFX90A-V2A-DIS-NEXT: .cfi_def_cfa_register 64 +; GFX90A-V2A-DIS-NEXT: s_mov_b32 s33, s40 +; GFX90A-V2A-DIS-NEXT: s_waitcnt vmcnt(0) +; GFX90A-V2A-DIS-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-V2A-EN-LABEL: callee_need_to_spill_fp_to_memory: +; GFX90A-V2A-EN: .Lfunc_begin2: +; GFX90A-V2A-EN-NEXT: .cfi_startproc +; GFX90A-V2A-EN-NEXT: ; %bb.0: +; GFX90A-V2A-EN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GFX90A-V2A-EN-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2560 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2561 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2562 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2563 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2564 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2565 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2566 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2567 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2568 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2569 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2570 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2571 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2572 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2573 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2574 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2575 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2576 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2577 +; GFX90A-V2A-EN-NEXT: 
.cfi_undefined 2578 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2579 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2580 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2581 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2582 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2583 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2584 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2585 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2586 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2587 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2588 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2589 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2590 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2591 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2592 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2593 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2594 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2595 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2596 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2597 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2598 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2599 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2608 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2609 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2610 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2611 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2612 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2613 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2614 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2615 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2624 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2625 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2626 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2627 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2628 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2629 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2630 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2631 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2640 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2641 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2642 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2643 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2644 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2645 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2646 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2647 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2656 
+; GFX90A-V2A-EN-NEXT: .cfi_undefined 2657 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2658 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2659 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2660 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2661 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2662 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2663 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2672 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2673 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2674 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2675 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2676 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2677 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2678 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2679 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2688 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2689 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2690 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2691 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2692 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2693 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2694 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2695 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2704 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2705 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2706 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2707 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2708 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2709 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2710 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2711 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2720 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2721 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2722 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2723 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2724 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2725 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2726 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2727 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2736 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2737 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2738 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2739 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2740 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2741 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2742 +; 
GFX90A-V2A-EN-NEXT: .cfi_undefined 2743 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2752 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2753 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2754 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2755 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2756 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2757 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2758 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2759 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2768 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2769 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2770 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2771 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2772 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2773 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2774 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2775 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2784 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2785 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2786 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2787 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2788 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2789 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2790 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2791 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2800 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2801 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2802 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2803 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2804 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2805 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2806 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2807 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3072 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3073 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3074 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3075 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3076 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3077 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3078 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3079 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3080 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3081 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3082 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3083 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3084 +; GFX90A-V2A-EN-NEXT: 
.cfi_undefined 3085 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3086 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3087 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3088 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3089 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3090 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3091 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3092 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3093 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3094 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3095 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3096 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3097 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3098 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3099 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3100 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3101 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3102 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3103 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 36 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 37 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 38 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 39 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 40 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 41 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 42 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 43 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 44 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 45 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 46 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 47 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 48 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 49 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 50 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 51 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 52 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 53 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 54 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 55 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 56 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 57 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 58 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 59 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 60 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 61 +; GFX90A-V2A-EN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-V2A-EN-NEXT: s_mov_b32 s40, s33 +; GFX90A-V2A-EN-NEXT: 
.cfi_register 65, 72 +; GFX90A-V2A-EN-NEXT: s_mov_b32 s33, s32 +; GFX90A-V2A-EN-NEXT: .cfi_def_cfa_register 65 +; GFX90A-V2A-EN-NEXT: s_addk_i32 s32, 0x5100 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2600, 3072, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2601, 3073, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2602, 3074, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2603, 3075, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2604, 3076, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2605, 3077, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2606, 3078, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2607, 3079, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2616, 3080, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2617, 3081, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2618, 3082, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2619, 3083, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 
2620, 3084, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2621, 3085, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2622, 3086, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2623, 3087, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a16, v72 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2632, 3088, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a17, v73 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2633, 3089, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a18, v74 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2634, 3090, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a19, v75 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2635, 3091, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a20, v76 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2636, 3092, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a21, v77 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2637, 3093, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a22, v78 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2638, 3094, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a23, v79 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2639, 3095, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a24, v88 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2648, 3096, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a25, v89 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2649, 3097, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a26, v90 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: 
.cfi_llvm_vector_register_mask 2650, 3098, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a27, v91 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2651, 3099, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a28, v92 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2652, 3100, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a29, v93 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2653, 3101, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a30, v94 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2654, 3102, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a31, v95 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2655, 3103, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2664, 32, 17, 64, 20224 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2665, 32, 17, 64, 19968 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2666, 32, 17, 64, 19712 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2667, 32, 17, 64, 19456 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2668, 32, 17, 64, 19200 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2669, 32, 17, 64, 18944 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2670, 32, 17, 64, 18688 +; GFX90A-V2A-EN-NEXT: buffer_store_dword 
v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2671, 32, 17, 64, 18432 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2680, 32, 17, 64, 18176 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2681, 32, 17, 64, 17920 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2682, 32, 17, 64, 17664 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2683, 32, 17, 64, 17408 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2684, 32, 17, 64, 17152 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2685, 32, 17, 64, 16896 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2686, 32, 17, 64, 16640 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2687, 32, 17, 64, 16384 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2696, 32, 17, 64, 16128 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2697, 32, 17, 64, 15872 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2698, 32, 17, 64, 15616 +; 
GFX90A-V2A-EN-NEXT: buffer_store_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2699, 32, 17, 64, 15360 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2700, 32, 17, 64, 15104 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2701, 32, 17, 64, 14848 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2702, 32, 17, 64, 14592 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2703, 32, 17, 64, 14336 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2712, 32, 17, 64, 14080 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2713, 32, 17, 64, 13824 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2714, 32, 17, 64, 13568 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2715, 32, 17, 64, 13312 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2716, 32, 17, 64, 13056 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2717, 32, 17, 64, 12800 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: 
.cfi_llvm_vector_offset 2718, 32, 17, 64, 12544 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2719, 32, 17, 64, 12288 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2728, 32, 17, 64, 12032 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2729, 32, 17, 64, 11776 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2730, 32, 17, 64, 11520 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2731, 32, 17, 64, 11264 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2732, 32, 17, 64, 11008 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2733, 32, 17, 64, 10752 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2734, 32, 17, 64, 10496 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2735, 32, 17, 64, 10240 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2744, 32, 17, 64, 9984 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2745, 32, 17, 64, 9728 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded 
Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2746, 32, 17, 64, 9472 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2747, 32, 17, 64, 9216 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2748, 32, 17, 64, 8960 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2749, 32, 17, 64, 8704 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2750, 32, 17, 64, 8448 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2751, 32, 17, 64, 8192 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2760, 32, 17, 64, 7936 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2761, 32, 17, 64, 7680 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2762, 32, 17, 64, 7424 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2763, 32, 17, 64, 7168 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2764, 32, 17, 64, 6912 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2765, 32, 17, 64, 6656 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v206, off, s[0:3], s33 offset:100 
; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2766, 32, 17, 64, 6400 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2767, 32, 17, 64, 6144 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2776, 32, 17, 64, 5888 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2777, 32, 17, 64, 5632 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2778, 32, 17, 64, 5376 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2779, 32, 17, 64, 5120 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2780, 32, 17, 64, 4864 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2781, 32, 17, 64, 4608 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2782, 32, 17, 64, 4352 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2783, 32, 17, 64, 4096 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2792, 32, 17, 64, 3840 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2793, 32, 17, 64, 3584 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v234, off, s[0:3], s33 
offset:52 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2794, 32, 17, 64, 3328 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2795, 32, 17, 64, 3072 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2796, 32, 17, 64, 2816 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2797, 32, 17, 64, 2560 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2798, 32, 17, 64, 2304 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2799, 32, 17, 64, 2048 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2808, 32, 17, 64, 1792 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2809, 32, 17, 64, 1536 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2810, 32, 17, 64, 1280 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2811, 32, 17, 64, 1024 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2812, 32, 17, 64, 768 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2813, 32, 17, 64, 512 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v254, off, s[0:3], 
s33 offset:4 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2814, 32, 17, 64, 256 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_offset 2815, 32, 17, 64, 0 +; GFX90A-V2A-EN-NEXT: ;;#ASMSTART +; GFX90A-V2A-EN-NEXT: ; clobber nonpreserved SGPRs +; GFX90A-V2A-EN-NEXT: ;;#ASMEND +; GFX90A-V2A-EN-NEXT: ;;#ASMSTART +; GFX90A-V2A-EN-NEXT: ; clobber all VGPRs +; GFX90A-V2A-EN-NEXT: ;;#ASMEND +; GFX90A-V2A-EN-NEXT: buffer_load_dword v255, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v232, off, s[0:3], s33 
offset:60 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; 
GFX90A-V2A-EN-NEXT: buffer_load_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword 
v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v107, off, s[0:3], s33 offset:304 ; 
4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: buffer_load_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v95, a31 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v94, a30 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v93, a29 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v92, a28 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v91, a27 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v90, a26 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v89, a25 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v88, a24 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v79, a23 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v78, a22 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v77, a21 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v76, a20 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v75, a19 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v74, a18 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v73, a17 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v72, a16 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v63, a15 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: 
v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: s_mov_b32 s32, s33 +; GFX90A-V2A-EN-NEXT: .cfi_def_cfa_register 64 +; GFX90A-V2A-EN-NEXT: s_mov_b32 s33, s40 +; GFX90A-V2A-EN-NEXT: s_waitcnt vmcnt(0) +; GFX90A-V2A-EN-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: callee_need_to_spill_fp_to_memory: +; WAVE32: .Lfunc_begin2: +; WAVE32-NEXT: .cfi_startproc +; WAVE32-NEXT: ; %bb.0: +; WAVE32-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE32-NEXT: .cfi_undefined 1536 +; WAVE32-NEXT: .cfi_undefined 1537 +; WAVE32-NEXT: .cfi_undefined 1538 +; WAVE32-NEXT: .cfi_undefined 1539 +; WAVE32-NEXT: .cfi_undefined 1540 +; WAVE32-NEXT: .cfi_undefined 1541 +; WAVE32-NEXT: .cfi_undefined 1542 +; WAVE32-NEXT: .cfi_undefined 1543 +; WAVE32-NEXT: .cfi_undefined 1544 +; WAVE32-NEXT: .cfi_undefined 1545 +; WAVE32-NEXT: .cfi_undefined 1546 +; WAVE32-NEXT: .cfi_undefined 1547 +; WAVE32-NEXT: .cfi_undefined 1548 +; WAVE32-NEXT: .cfi_undefined 1549 +; WAVE32-NEXT: .cfi_undefined 1550 +; WAVE32-NEXT: .cfi_undefined 1551 +; WAVE32-NEXT: .cfi_undefined 1552 +; WAVE32-NEXT: .cfi_undefined 1553 +; WAVE32-NEXT: .cfi_undefined 1554 +; WAVE32-NEXT: .cfi_undefined 1555 +; WAVE32-NEXT: .cfi_undefined 1556 +; WAVE32-NEXT: .cfi_undefined 1557 +; WAVE32-NEXT: .cfi_undefined 1558 +; WAVE32-NEXT: .cfi_undefined 1559 +; WAVE32-NEXT: .cfi_undefined 1560 +; WAVE32-NEXT: .cfi_undefined 1561 +; WAVE32-NEXT: .cfi_undefined 1562 +; WAVE32-NEXT: .cfi_undefined 1563 +; WAVE32-NEXT: .cfi_undefined 1564 +; WAVE32-NEXT: .cfi_undefined 1565 +; WAVE32-NEXT: 
.cfi_undefined 1566 +; WAVE32-NEXT: .cfi_undefined 1567 +; WAVE32-NEXT: .cfi_undefined 1568 +; WAVE32-NEXT: .cfi_undefined 1569 +; WAVE32-NEXT: .cfi_undefined 1570 +; WAVE32-NEXT: .cfi_undefined 1571 +; WAVE32-NEXT: .cfi_undefined 1572 +; WAVE32-NEXT: .cfi_undefined 1573 +; WAVE32-NEXT: .cfi_undefined 1574 +; WAVE32-NEXT: .cfi_undefined 1575 +; WAVE32-NEXT: .cfi_undefined 1584 +; WAVE32-NEXT: .cfi_undefined 1585 +; WAVE32-NEXT: .cfi_undefined 1586 +; WAVE32-NEXT: .cfi_undefined 1587 +; WAVE32-NEXT: .cfi_undefined 1588 +; WAVE32-NEXT: .cfi_undefined 1589 +; WAVE32-NEXT: .cfi_undefined 1590 +; WAVE32-NEXT: .cfi_undefined 1591 +; WAVE32-NEXT: .cfi_undefined 1600 +; WAVE32-NEXT: .cfi_undefined 1601 +; WAVE32-NEXT: .cfi_undefined 1602 +; WAVE32-NEXT: .cfi_undefined 1603 +; WAVE32-NEXT: .cfi_undefined 1604 +; WAVE32-NEXT: .cfi_undefined 1605 +; WAVE32-NEXT: .cfi_undefined 1606 +; WAVE32-NEXT: .cfi_undefined 1607 +; WAVE32-NEXT: .cfi_undefined 1616 +; WAVE32-NEXT: .cfi_undefined 1617 +; WAVE32-NEXT: .cfi_undefined 1618 +; WAVE32-NEXT: .cfi_undefined 1619 +; WAVE32-NEXT: .cfi_undefined 1620 +; WAVE32-NEXT: .cfi_undefined 1621 +; WAVE32-NEXT: .cfi_undefined 1622 +; WAVE32-NEXT: .cfi_undefined 1623 +; WAVE32-NEXT: .cfi_undefined 1632 +; WAVE32-NEXT: .cfi_undefined 1633 +; WAVE32-NEXT: .cfi_undefined 1634 +; WAVE32-NEXT: .cfi_undefined 1635 +; WAVE32-NEXT: .cfi_undefined 1636 +; WAVE32-NEXT: .cfi_undefined 1637 +; WAVE32-NEXT: .cfi_undefined 1638 +; WAVE32-NEXT: .cfi_undefined 1639 +; WAVE32-NEXT: .cfi_undefined 1648 +; WAVE32-NEXT: .cfi_undefined 1649 +; WAVE32-NEXT: .cfi_undefined 1650 +; WAVE32-NEXT: .cfi_undefined 1651 +; WAVE32-NEXT: .cfi_undefined 1652 +; WAVE32-NEXT: .cfi_undefined 1653 +; WAVE32-NEXT: .cfi_undefined 1654 +; WAVE32-NEXT: .cfi_undefined 1655 +; WAVE32-NEXT: .cfi_undefined 1664 +; WAVE32-NEXT: .cfi_undefined 1665 +; WAVE32-NEXT: .cfi_undefined 1666 +; WAVE32-NEXT: .cfi_undefined 1667 +; WAVE32-NEXT: .cfi_undefined 1668 +; WAVE32-NEXT: .cfi_undefined 1669 
+; WAVE32-NEXT: .cfi_undefined 1670 +; WAVE32-NEXT: .cfi_undefined 1671 +; WAVE32-NEXT: .cfi_undefined 1680 +; WAVE32-NEXT: .cfi_undefined 1681 +; WAVE32-NEXT: .cfi_undefined 1682 +; WAVE32-NEXT: .cfi_undefined 1683 +; WAVE32-NEXT: .cfi_undefined 1684 +; WAVE32-NEXT: .cfi_undefined 1685 +; WAVE32-NEXT: .cfi_undefined 1686 +; WAVE32-NEXT: .cfi_undefined 1687 +; WAVE32-NEXT: .cfi_undefined 1696 +; WAVE32-NEXT: .cfi_undefined 1697 +; WAVE32-NEXT: .cfi_undefined 1698 +; WAVE32-NEXT: .cfi_undefined 1699 +; WAVE32-NEXT: .cfi_undefined 1700 +; WAVE32-NEXT: .cfi_undefined 1701 +; WAVE32-NEXT: .cfi_undefined 1702 +; WAVE32-NEXT: .cfi_undefined 1703 +; WAVE32-NEXT: .cfi_undefined 1712 +; WAVE32-NEXT: .cfi_undefined 1713 +; WAVE32-NEXT: .cfi_undefined 1714 +; WAVE32-NEXT: .cfi_undefined 1715 +; WAVE32-NEXT: .cfi_undefined 1716 +; WAVE32-NEXT: .cfi_undefined 1717 +; WAVE32-NEXT: .cfi_undefined 1718 +; WAVE32-NEXT: .cfi_undefined 1719 +; WAVE32-NEXT: .cfi_undefined 1728 +; WAVE32-NEXT: .cfi_undefined 1729 +; WAVE32-NEXT: .cfi_undefined 1730 +; WAVE32-NEXT: .cfi_undefined 1731 +; WAVE32-NEXT: .cfi_undefined 1732 +; WAVE32-NEXT: .cfi_undefined 1733 +; WAVE32-NEXT: .cfi_undefined 1734 +; WAVE32-NEXT: .cfi_undefined 1735 +; WAVE32-NEXT: .cfi_undefined 1744 +; WAVE32-NEXT: .cfi_undefined 1745 +; WAVE32-NEXT: .cfi_undefined 1746 +; WAVE32-NEXT: .cfi_undefined 1747 +; WAVE32-NEXT: .cfi_undefined 1748 +; WAVE32-NEXT: .cfi_undefined 1749 +; WAVE32-NEXT: .cfi_undefined 1750 +; WAVE32-NEXT: .cfi_undefined 1751 +; WAVE32-NEXT: .cfi_undefined 1760 +; WAVE32-NEXT: .cfi_undefined 1761 +; WAVE32-NEXT: .cfi_undefined 1762 +; WAVE32-NEXT: .cfi_undefined 1763 +; WAVE32-NEXT: .cfi_undefined 1764 +; WAVE32-NEXT: .cfi_undefined 1765 +; WAVE32-NEXT: .cfi_undefined 1766 +; WAVE32-NEXT: .cfi_undefined 1767 +; WAVE32-NEXT: .cfi_undefined 1776 +; WAVE32-NEXT: .cfi_undefined 1777 +; WAVE32-NEXT: .cfi_undefined 1778 +; WAVE32-NEXT: .cfi_undefined 1779 +; WAVE32-NEXT: .cfi_undefined 1780 +; WAVE32-NEXT: 
.cfi_undefined 1781 +; WAVE32-NEXT: .cfi_undefined 1782 +; WAVE32-NEXT: .cfi_undefined 1783 +; WAVE32-NEXT: .cfi_undefined 36 +; WAVE32-NEXT: .cfi_undefined 37 +; WAVE32-NEXT: .cfi_undefined 38 +; WAVE32-NEXT: .cfi_undefined 39 +; WAVE32-NEXT: .cfi_undefined 40 +; WAVE32-NEXT: .cfi_undefined 41 +; WAVE32-NEXT: .cfi_undefined 42 +; WAVE32-NEXT: .cfi_undefined 43 +; WAVE32-NEXT: .cfi_undefined 44 +; WAVE32-NEXT: .cfi_undefined 45 +; WAVE32-NEXT: .cfi_undefined 46 +; WAVE32-NEXT: .cfi_undefined 47 +; WAVE32-NEXT: .cfi_undefined 48 +; WAVE32-NEXT: .cfi_undefined 49 +; WAVE32-NEXT: .cfi_undefined 50 +; WAVE32-NEXT: .cfi_undefined 51 +; WAVE32-NEXT: .cfi_undefined 52 +; WAVE32-NEXT: .cfi_undefined 53 +; WAVE32-NEXT: .cfi_undefined 54 +; WAVE32-NEXT: .cfi_undefined 55 +; WAVE32-NEXT: .cfi_undefined 56 +; WAVE32-NEXT: .cfi_undefined 57 +; WAVE32-NEXT: .cfi_undefined 58 +; WAVE32-NEXT: .cfi_undefined 59 +; WAVE32-NEXT: .cfi_undefined 60 +; WAVE32-NEXT: .cfi_undefined 61 +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: s_mov_b32 s40, s33 +; WAVE32-NEXT: .cfi_register 65, 72 +; WAVE32-NEXT: s_mov_b32 s33, s32 +; WAVE32-NEXT: .cfi_def_cfa_register 65 +; WAVE32-NEXT: s_addk_i32 s32, 0x3880 +; WAVE32-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1576, 32, 1, 32, 14208 +; WAVE32-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1577, 32, 1, 32, 14080 +; WAVE32-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1578, 32, 1, 32, 13952 +; WAVE32-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1579, 32, 1, 32, 13824 +; WAVE32-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1580, 32, 1, 32, 13696 +; WAVE32-NEXT: 
buffer_store_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1581, 32, 1, 32, 13568 +; WAVE32-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1582, 32, 1, 32, 13440 +; WAVE32-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1583, 32, 1, 32, 13312 +; WAVE32-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1592, 32, 1, 32, 13184 +; WAVE32-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1593, 32, 1, 32, 13056 +; WAVE32-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1594, 32, 1, 32, 12928 +; WAVE32-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1595, 32, 1, 32, 12800 +; WAVE32-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1596, 32, 1, 32, 12672 +; WAVE32-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1597, 32, 1, 32, 12544 +; WAVE32-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1598, 32, 1, 32, 12416 +; WAVE32-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1599, 32, 1, 32, 12288 +; WAVE32-NEXT: buffer_store_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1608, 32, 1, 32, 12160 +; WAVE32-NEXT: buffer_store_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1609, 32, 1, 32, 12032 +; WAVE32-NEXT: 
buffer_store_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1610, 32, 1, 32, 11904 +; WAVE32-NEXT: buffer_store_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1611, 32, 1, 32, 11776 +; WAVE32-NEXT: buffer_store_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1612, 32, 1, 32, 11648 +; WAVE32-NEXT: buffer_store_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1613, 32, 1, 32, 11520 +; WAVE32-NEXT: buffer_store_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1614, 32, 1, 32, 11392 +; WAVE32-NEXT: buffer_store_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1615, 32, 1, 32, 11264 +; WAVE32-NEXT: buffer_store_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1624, 32, 1, 32, 11136 +; WAVE32-NEXT: buffer_store_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1625, 32, 1, 32, 11008 +; WAVE32-NEXT: buffer_store_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1626, 32, 1, 32, 10880 +; WAVE32-NEXT: buffer_store_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1627, 32, 1, 32, 10752 +; WAVE32-NEXT: buffer_store_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1628, 32, 1, 32, 10624 +; WAVE32-NEXT: buffer_store_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1629, 32, 1, 32, 10496 +; WAVE32-NEXT: buffer_store_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1630, 32, 1, 32, 10368 +; WAVE32-NEXT: 
buffer_store_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1631, 32, 1, 32, 10240 +; WAVE32-NEXT: buffer_store_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1640, 32, 1, 32, 10112 +; WAVE32-NEXT: buffer_store_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1641, 32, 1, 32, 9984 +; WAVE32-NEXT: buffer_store_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1642, 32, 1, 32, 9856 +; WAVE32-NEXT: buffer_store_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1643, 32, 1, 32, 9728 +; WAVE32-NEXT: buffer_store_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1644, 32, 1, 32, 9600 +; WAVE32-NEXT: buffer_store_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1645, 32, 1, 32, 9472 +; WAVE32-NEXT: buffer_store_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1646, 32, 1, 32, 9344 +; WAVE32-NEXT: buffer_store_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1647, 32, 1, 32, 9216 +; WAVE32-NEXT: buffer_store_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1656, 32, 1, 32, 9088 +; WAVE32-NEXT: buffer_store_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1657, 32, 1, 32, 8960 +; WAVE32-NEXT: buffer_store_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1658, 32, 1, 32, 8832 +; WAVE32-NEXT: buffer_store_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1659, 32, 1, 32, 8704 +; WAVE32-NEXT: 
buffer_store_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1660, 32, 1, 32, 8576 +; WAVE32-NEXT: buffer_store_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1661, 32, 1, 32, 8448 +; WAVE32-NEXT: buffer_store_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1662, 32, 1, 32, 8320 +; WAVE32-NEXT: buffer_store_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1663, 32, 1, 32, 8192 +; WAVE32-NEXT: buffer_store_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1672, 32, 1, 32, 8064 +; WAVE32-NEXT: buffer_store_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1673, 32, 1, 32, 7936 +; WAVE32-NEXT: buffer_store_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1674, 32, 1, 32, 7808 +; WAVE32-NEXT: buffer_store_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1675, 32, 1, 32, 7680 +; WAVE32-NEXT: buffer_store_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1676, 32, 1, 32, 7552 +; WAVE32-NEXT: buffer_store_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1677, 32, 1, 32, 7424 +; WAVE32-NEXT: buffer_store_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1678, 32, 1, 32, 7296 +; WAVE32-NEXT: buffer_store_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1679, 32, 1, 32, 7168 +; WAVE32-NEXT: buffer_store_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1688, 32, 1, 32, 7040 +; WAVE32-NEXT: 
buffer_store_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1689, 32, 1, 32, 6912 +; WAVE32-NEXT: buffer_store_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1690, 32, 1, 32, 6784 +; WAVE32-NEXT: buffer_store_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1691, 32, 1, 32, 6656 +; WAVE32-NEXT: buffer_store_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1692, 32, 1, 32, 6528 +; WAVE32-NEXT: buffer_store_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1693, 32, 1, 32, 6400 +; WAVE32-NEXT: buffer_store_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1694, 32, 1, 32, 6272 +; WAVE32-NEXT: buffer_store_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1695, 32, 1, 32, 6144 +; WAVE32-NEXT: buffer_store_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1704, 32, 1, 32, 6016 +; WAVE32-NEXT: buffer_store_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1705, 32, 1, 32, 5888 +; WAVE32-NEXT: buffer_store_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1706, 32, 1, 32, 5760 +; WAVE32-NEXT: buffer_store_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1707, 32, 1, 32, 5632 +; WAVE32-NEXT: buffer_store_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1708, 32, 1, 32, 5504 +; WAVE32-NEXT: buffer_store_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1709, 32, 1, 32, 5376 +; WAVE32-NEXT: 
buffer_store_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1710, 32, 1, 32, 5248 +; WAVE32-NEXT: buffer_store_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1711, 32, 1, 32, 5120 +; WAVE32-NEXT: buffer_store_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1720, 32, 1, 32, 4992 +; WAVE32-NEXT: buffer_store_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1721, 32, 1, 32, 4864 +; WAVE32-NEXT: buffer_store_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1722, 32, 1, 32, 4736 +; WAVE32-NEXT: buffer_store_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1723, 32, 1, 32, 4608 +; WAVE32-NEXT: buffer_store_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1724, 32, 1, 32, 4480 +; WAVE32-NEXT: buffer_store_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1725, 32, 1, 32, 4352 +; WAVE32-NEXT: buffer_store_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1726, 32, 1, 32, 4224 +; WAVE32-NEXT: buffer_store_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1727, 32, 1, 32, 4096 +; WAVE32-NEXT: buffer_store_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1736, 32, 1, 32, 3968 +; WAVE32-NEXT: buffer_store_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1737, 32, 1, 32, 3840 +; WAVE32-NEXT: buffer_store_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1738, 32, 1, 32, 3712 +; WAVE32-NEXT: 
buffer_store_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1739, 32, 1, 32, 3584 +; WAVE32-NEXT: buffer_store_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1740, 32, 1, 32, 3456 +; WAVE32-NEXT: buffer_store_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1741, 32, 1, 32, 3328 +; WAVE32-NEXT: buffer_store_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1742, 32, 1, 32, 3200 +; WAVE32-NEXT: buffer_store_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1743, 32, 1, 32, 3072 +; WAVE32-NEXT: buffer_store_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1752, 32, 1, 32, 2944 +; WAVE32-NEXT: buffer_store_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1753, 32, 1, 32, 2816 +; WAVE32-NEXT: buffer_store_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1754, 32, 1, 32, 2688 +; WAVE32-NEXT: buffer_store_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1755, 32, 1, 32, 2560 +; WAVE32-NEXT: buffer_store_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1756, 32, 1, 32, 2432 +; WAVE32-NEXT: buffer_store_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1757, 32, 1, 32, 2304 +; WAVE32-NEXT: buffer_store_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1758, 32, 1, 32, 2176 +; WAVE32-NEXT: buffer_store_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1759, 32, 1, 32, 2048 +; WAVE32-NEXT: buffer_store_dword 
v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1768, 32, 1, 32, 1920 +; WAVE32-NEXT: buffer_store_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1769, 32, 1, 32, 1792 +; WAVE32-NEXT: buffer_store_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1770, 32, 1, 32, 1664 +; WAVE32-NEXT: buffer_store_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1771, 32, 1, 32, 1536 +; WAVE32-NEXT: buffer_store_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1772, 32, 1, 32, 1408 +; WAVE32-NEXT: buffer_store_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1773, 32, 1, 32, 1280 +; WAVE32-NEXT: buffer_store_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1774, 32, 1, 32, 1152 +; WAVE32-NEXT: buffer_store_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1775, 32, 1, 32, 1024 +; WAVE32-NEXT: buffer_store_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1784, 32, 1, 32, 896 +; WAVE32-NEXT: buffer_store_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1785, 32, 1, 32, 768 +; WAVE32-NEXT: buffer_store_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1786, 32, 1, 32, 640 +; WAVE32-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1787, 32, 1, 32, 512 +; WAVE32-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1788, 32, 1, 32, 384 +; WAVE32-NEXT: buffer_store_dword v253, off, s[0:3], s33 
offset:8 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1789, 32, 1, 32, 256 +; WAVE32-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1790, 32, 1, 32, 128 +; WAVE32-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1791, 32, 1, 32, 0 +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber nonpreserved SGPRs +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber all VGPRs +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: s_clause 0x3e +; WAVE32-NEXT: buffer_load_dword v255, off, s[0:3], s33 +; WAVE32-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:4 +; WAVE32-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:8 +; WAVE32-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:12 +; WAVE32-NEXT: buffer_load_dword v251, off, s[0:3], s33 offset:16 +; WAVE32-NEXT: buffer_load_dword v250, off, s[0:3], s33 offset:20 +; WAVE32-NEXT: buffer_load_dword v249, off, s[0:3], s33 offset:24 +; WAVE32-NEXT: buffer_load_dword v248, off, s[0:3], s33 offset:28 +; WAVE32-NEXT: buffer_load_dword v239, off, s[0:3], s33 offset:32 +; WAVE32-NEXT: buffer_load_dword v238, off, s[0:3], s33 offset:36 +; WAVE32-NEXT: buffer_load_dword v237, off, s[0:3], s33 offset:40 +; WAVE32-NEXT: buffer_load_dword v236, off, s[0:3], s33 offset:44 +; WAVE32-NEXT: buffer_load_dword v235, off, s[0:3], s33 offset:48 +; WAVE32-NEXT: buffer_load_dword v234, off, s[0:3], s33 offset:52 +; WAVE32-NEXT: buffer_load_dword v233, off, s[0:3], s33 offset:56 +; WAVE32-NEXT: buffer_load_dword v232, off, s[0:3], s33 offset:60 +; WAVE32-NEXT: buffer_load_dword v223, off, s[0:3], s33 offset:64 +; WAVE32-NEXT: buffer_load_dword v222, off, s[0:3], s33 offset:68 +; WAVE32-NEXT: buffer_load_dword v221, off, s[0:3], s33 offset:72 +; WAVE32-NEXT: buffer_load_dword v220, off, s[0:3], s33 offset:76 +; WAVE32-NEXT: buffer_load_dword v219, off, s[0:3], s33 offset:80 +; 
WAVE32-NEXT: buffer_load_dword v218, off, s[0:3], s33 offset:84 +; WAVE32-NEXT: buffer_load_dword v217, off, s[0:3], s33 offset:88 +; WAVE32-NEXT: buffer_load_dword v216, off, s[0:3], s33 offset:92 +; WAVE32-NEXT: buffer_load_dword v207, off, s[0:3], s33 offset:96 +; WAVE32-NEXT: buffer_load_dword v206, off, s[0:3], s33 offset:100 +; WAVE32-NEXT: buffer_load_dword v205, off, s[0:3], s33 offset:104 +; WAVE32-NEXT: buffer_load_dword v204, off, s[0:3], s33 offset:108 +; WAVE32-NEXT: buffer_load_dword v203, off, s[0:3], s33 offset:112 +; WAVE32-NEXT: buffer_load_dword v202, off, s[0:3], s33 offset:116 +; WAVE32-NEXT: buffer_load_dword v201, off, s[0:3], s33 offset:120 +; WAVE32-NEXT: buffer_load_dword v200, off, s[0:3], s33 offset:124 +; WAVE32-NEXT: buffer_load_dword v191, off, s[0:3], s33 offset:128 +; WAVE32-NEXT: buffer_load_dword v190, off, s[0:3], s33 offset:132 +; WAVE32-NEXT: buffer_load_dword v189, off, s[0:3], s33 offset:136 +; WAVE32-NEXT: buffer_load_dword v188, off, s[0:3], s33 offset:140 +; WAVE32-NEXT: buffer_load_dword v187, off, s[0:3], s33 offset:144 +; WAVE32-NEXT: buffer_load_dword v186, off, s[0:3], s33 offset:148 +; WAVE32-NEXT: buffer_load_dword v185, off, s[0:3], s33 offset:152 +; WAVE32-NEXT: buffer_load_dword v184, off, s[0:3], s33 offset:156 +; WAVE32-NEXT: buffer_load_dword v175, off, s[0:3], s33 offset:160 +; WAVE32-NEXT: buffer_load_dword v174, off, s[0:3], s33 offset:164 +; WAVE32-NEXT: buffer_load_dword v173, off, s[0:3], s33 offset:168 +; WAVE32-NEXT: buffer_load_dword v172, off, s[0:3], s33 offset:172 +; WAVE32-NEXT: buffer_load_dword v171, off, s[0:3], s33 offset:176 +; WAVE32-NEXT: buffer_load_dword v170, off, s[0:3], s33 offset:180 +; WAVE32-NEXT: buffer_load_dword v169, off, s[0:3], s33 offset:184 +; WAVE32-NEXT: buffer_load_dword v168, off, s[0:3], s33 offset:188 +; WAVE32-NEXT: buffer_load_dword v159, off, s[0:3], s33 offset:192 +; WAVE32-NEXT: buffer_load_dword v158, off, s[0:3], s33 offset:196 +; WAVE32-NEXT: buffer_load_dword 
v157, off, s[0:3], s33 offset:200 +; WAVE32-NEXT: buffer_load_dword v156, off, s[0:3], s33 offset:204 +; WAVE32-NEXT: buffer_load_dword v155, off, s[0:3], s33 offset:208 +; WAVE32-NEXT: buffer_load_dword v154, off, s[0:3], s33 offset:212 +; WAVE32-NEXT: buffer_load_dword v153, off, s[0:3], s33 offset:216 +; WAVE32-NEXT: buffer_load_dword v152, off, s[0:3], s33 offset:220 +; WAVE32-NEXT: buffer_load_dword v143, off, s[0:3], s33 offset:224 +; WAVE32-NEXT: buffer_load_dword v142, off, s[0:3], s33 offset:228 +; WAVE32-NEXT: buffer_load_dword v141, off, s[0:3], s33 offset:232 +; WAVE32-NEXT: buffer_load_dword v140, off, s[0:3], s33 offset:236 +; WAVE32-NEXT: buffer_load_dword v139, off, s[0:3], s33 offset:240 +; WAVE32-NEXT: buffer_load_dword v138, off, s[0:3], s33 offset:244 +; WAVE32-NEXT: buffer_load_dword v137, off, s[0:3], s33 offset:248 +; WAVE32-NEXT: s_clause 0x30 +; WAVE32-NEXT: buffer_load_dword v136, off, s[0:3], s33 offset:252 +; WAVE32-NEXT: buffer_load_dword v127, off, s[0:3], s33 offset:256 +; WAVE32-NEXT: buffer_load_dword v126, off, s[0:3], s33 offset:260 +; WAVE32-NEXT: buffer_load_dword v125, off, s[0:3], s33 offset:264 +; WAVE32-NEXT: buffer_load_dword v124, off, s[0:3], s33 offset:268 +; WAVE32-NEXT: buffer_load_dword v123, off, s[0:3], s33 offset:272 +; WAVE32-NEXT: buffer_load_dword v122, off, s[0:3], s33 offset:276 +; WAVE32-NEXT: buffer_load_dword v121, off, s[0:3], s33 offset:280 +; WAVE32-NEXT: buffer_load_dword v120, off, s[0:3], s33 offset:284 +; WAVE32-NEXT: buffer_load_dword v111, off, s[0:3], s33 offset:288 +; WAVE32-NEXT: buffer_load_dword v110, off, s[0:3], s33 offset:292 +; WAVE32-NEXT: buffer_load_dword v109, off, s[0:3], s33 offset:296 +; WAVE32-NEXT: buffer_load_dword v108, off, s[0:3], s33 offset:300 +; WAVE32-NEXT: buffer_load_dword v107, off, s[0:3], s33 offset:304 +; WAVE32-NEXT: buffer_load_dword v106, off, s[0:3], s33 offset:308 +; WAVE32-NEXT: buffer_load_dword v105, off, s[0:3], s33 offset:312 +; WAVE32-NEXT: 
buffer_load_dword v104, off, s[0:3], s33 offset:316 +; WAVE32-NEXT: buffer_load_dword v95, off, s[0:3], s33 offset:320 +; WAVE32-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:324 +; WAVE32-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:328 +; WAVE32-NEXT: buffer_load_dword v92, off, s[0:3], s33 offset:332 +; WAVE32-NEXT: buffer_load_dword v91, off, s[0:3], s33 offset:336 +; WAVE32-NEXT: buffer_load_dword v90, off, s[0:3], s33 offset:340 +; WAVE32-NEXT: buffer_load_dword v89, off, s[0:3], s33 offset:344 +; WAVE32-NEXT: buffer_load_dword v88, off, s[0:3], s33 offset:348 +; WAVE32-NEXT: buffer_load_dword v79, off, s[0:3], s33 offset:352 +; WAVE32-NEXT: buffer_load_dword v78, off, s[0:3], s33 offset:356 +; WAVE32-NEXT: buffer_load_dword v77, off, s[0:3], s33 offset:360 +; WAVE32-NEXT: buffer_load_dword v76, off, s[0:3], s33 offset:364 +; WAVE32-NEXT: buffer_load_dword v75, off, s[0:3], s33 offset:368 +; WAVE32-NEXT: buffer_load_dword v74, off, s[0:3], s33 offset:372 +; WAVE32-NEXT: buffer_load_dword v73, off, s[0:3], s33 offset:376 +; WAVE32-NEXT: buffer_load_dword v72, off, s[0:3], s33 offset:380 +; WAVE32-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:384 +; WAVE32-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:388 +; WAVE32-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:392 +; WAVE32-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:396 +; WAVE32-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:400 +; WAVE32-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:404 +; WAVE32-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:408 +; WAVE32-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:412 +; WAVE32-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:416 +; WAVE32-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:420 +; WAVE32-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:424 +; WAVE32-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:428 +; WAVE32-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:432 +; 
WAVE32-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 +; WAVE32-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 +; WAVE32-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 +; WAVE32-NEXT: s_mov_b32 s32, s33 +; WAVE32-NEXT: .cfi_def_cfa_register 64 +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 s33, s40 +; WAVE32-NEXT: s_waitcnt vmcnt(0) +; WAVE32-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber nonpreserved SGPRs", "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} @@ -101,535 +2223,1173 @@ define void @callee_need_to_spill_fp_to_memory() #1 { declare hidden void @ex() #0 -; CHECK-LABEL: func_call_clobber: -; CHECK: .cfi_startproc - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: %bb.0: -; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 -; CHECK-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 - -; VGPR0_wave64 = 2560 -; WAVE64-NEXT: .cfi_undefined 2560 -; WAVE64-NEXT: .cfi_undefined 2561 -; WAVE64-NEXT: .cfi_undefined 2562 -; WAVE64-NEXT: .cfi_undefined 2563 -; WAVE64-NEXT: .cfi_undefined 2564 -; WAVE64-NEXT: .cfi_undefined 2565 -; WAVE64-NEXT: .cfi_undefined 2566 -; WAVE64-NEXT: .cfi_undefined 2567 -; WAVE64-NEXT: .cfi_undefined 2568 -; WAVE64-NEXT: .cfi_undefined 2569 -; WAVE64-NEXT: .cfi_undefined 2570 -; WAVE64-NEXT: .cfi_undefined 2571 -; WAVE64-NEXT: .cfi_undefined 2572 -; WAVE64-NEXT: .cfi_undefined 2573 -; WAVE64-NEXT: .cfi_undefined 2574 -; WAVE64-NEXT: .cfi_undefined 2575 -; WAVE64-NEXT: .cfi_undefined 2576 -; WAVE64-NEXT: .cfi_undefined 2577 -; WAVE64-NEXT: .cfi_undefined 2578 -; WAVE64-NEXT: .cfi_undefined 2579 -; WAVE64-NEXT: .cfi_undefined 2580 -; WAVE64-NEXT: .cfi_undefined 2581 -; WAVE64-NEXT: .cfi_undefined 2582 -; WAVE64-NEXT: .cfi_undefined 2583 -; WAVE64-NEXT: .cfi_undefined 2584 -; WAVE64-NEXT: .cfi_undefined 2585 -; WAVE64-NEXT: .cfi_undefined 2586 -; WAVE64-NEXT: .cfi_undefined 2587 -; WAVE64-NEXT: .cfi_undefined 2588 -; WAVE64-NEXT: 
.cfi_undefined 2589 -; WAVE64-NEXT: .cfi_undefined 2590 -; WAVE64-NEXT: .cfi_undefined 2591 -; WAVE64-NEXT: .cfi_undefined 2592 -; WAVE64-NEXT: .cfi_undefined 2593 -; WAVE64-NEXT: .cfi_undefined 2594 -; WAVE64-NEXT: .cfi_undefined 2595 -; WAVE64-NEXT: .cfi_undefined 2596 -; WAVE64-NEXT: .cfi_undefined 2597 -; WAVE64-NEXT: .cfi_undefined 2598 -; WAVE64-NEXT: .cfi_undefined 2599 - -; VPGR48_wave64 = 2608 -; WAVE64-NEXT: .cfi_undefined 2608 -; WAVE64-NEXT: .cfi_undefined 2609 -; WAVE64-NEXT: .cfi_undefined 2610 -; WAVE64-NEXT: .cfi_undefined 2611 -; WAVE64-NEXT: .cfi_undefined 2612 -; WAVE64-NEXT: .cfi_undefined 2613 -; WAVE64-NEXT: .cfi_undefined 2614 -; WAVE64-NEXT: .cfi_undefined 2615 - -; WAVE64-NEXT: .cfi_undefined 2624 -; WAVE64-NEXT: .cfi_undefined 2625 -; WAVE64-NEXT: .cfi_undefined 2626 -; WAVE64-NEXT: .cfi_undefined 2627 -; WAVE64-NEXT: .cfi_undefined 2628 -; WAVE64-NEXT: .cfi_undefined 2629 -; WAVE64-NEXT: .cfi_undefined 2630 -; WAVE64-NEXT: .cfi_undefined 2631 - -; WAVE64-NEXT: .cfi_undefined 2640 -; WAVE64-NEXT: .cfi_undefined 2641 -; WAVE64-NEXT: .cfi_undefined 2642 -; WAVE64-NEXT: .cfi_undefined 2643 -; WAVE64-NEXT: .cfi_undefined 2644 -; WAVE64-NEXT: .cfi_undefined 2645 -; WAVE64-NEXT: .cfi_undefined 2646 -; WAVE64-NEXT: .cfi_undefined 2647 - -; WAVE64-NEXT: .cfi_undefined 2656 -; WAVE64-NEXT: .cfi_undefined 2657 -; WAVE64-NEXT: .cfi_undefined 2658 -; WAVE64-NEXT: .cfi_undefined 2659 -; WAVE64-NEXT: .cfi_undefined 2660 -; WAVE64-NEXT: .cfi_undefined 2661 -; WAVE64-NEXT: .cfi_undefined 2662 -; WAVE64-NEXT: .cfi_undefined 2663 - -; WAVE64-NEXT: .cfi_undefined 2672 -; WAVE64-NEXT: .cfi_undefined 2673 -; WAVE64-NEXT: .cfi_undefined 2674 -; WAVE64-NEXT: .cfi_undefined 2675 -; WAVE64-NEXT: .cfi_undefined 2676 -; WAVE64-NEXT: .cfi_undefined 2677 -; WAVE64-NEXT: .cfi_undefined 2678 -; WAVE64-NEXT: .cfi_undefined 2679 - -; WAVE64-NEXT: .cfi_undefined 2688 -; WAVE64-NEXT: .cfi_undefined 2689 -; WAVE64-NEXT: .cfi_undefined 2690 -; WAVE64-NEXT: .cfi_undefined 2691 
-; WAVE64-NEXT: .cfi_undefined 2692 -; WAVE64-NEXT: .cfi_undefined 2693 -; WAVE64-NEXT: .cfi_undefined 2694 -; WAVE64-NEXT: .cfi_undefined 2695 - -; WAVE64-NEXT: .cfi_undefined 2704 -; WAVE64-NEXT: .cfi_undefined 2705 -; WAVE64-NEXT: .cfi_undefined 2706 -; WAVE64-NEXT: .cfi_undefined 2707 -; WAVE64-NEXT: .cfi_undefined 2708 -; WAVE64-NEXT: .cfi_undefined 2709 -; WAVE64-NEXT: .cfi_undefined 2710 -; WAVE64-NEXT: .cfi_undefined 2711 - -; WAVE64-NEXT: .cfi_undefined 2720 -; WAVE64-NEXT: .cfi_undefined 2721 -; WAVE64-NEXT: .cfi_undefined 2722 -; WAVE64-NEXT: .cfi_undefined 2723 -; WAVE64-NEXT: .cfi_undefined 2724 -; WAVE64-NEXT: .cfi_undefined 2725 -; WAVE64-NEXT: .cfi_undefined 2726 -; WAVE64-NEXT: .cfi_undefined 2727 - -; WAVE64-NEXT: .cfi_undefined 2736 -; WAVE64-NEXT: .cfi_undefined 2737 -; WAVE64-NEXT: .cfi_undefined 2738 -; WAVE64-NEXT: .cfi_undefined 2739 -; WAVE64-NEXT: .cfi_undefined 2740 -; WAVE64-NEXT: .cfi_undefined 2741 -; WAVE64-NEXT: .cfi_undefined 2742 -; WAVE64-NEXT: .cfi_undefined 2743 - -; WAVE64-NEXT: .cfi_undefined 2752 -; WAVE64-NEXT: .cfi_undefined 2753 -; WAVE64-NEXT: .cfi_undefined 2754 -; WAVE64-NEXT: .cfi_undefined 2755 -; WAVE64-NEXT: .cfi_undefined 2756 -; WAVE64-NEXT: .cfi_undefined 2757 -; WAVE64-NEXT: .cfi_undefined 2758 -; WAVE64-NEXT: .cfi_undefined 2759 - -; WAVE64-NEXT: .cfi_undefined 2768 -; WAVE64-NEXT: .cfi_undefined 2769 -; WAVE64-NEXT: .cfi_undefined 2770 -; WAVE64-NEXT: .cfi_undefined 2771 -; WAVE64-NEXT: .cfi_undefined 2772 -; WAVE64-NEXT: .cfi_undefined 2773 -; WAVE64-NEXT: .cfi_undefined 2774 -; WAVE64-NEXT: .cfi_undefined 2775 - -; WAVE64-NEXT: .cfi_undefined 2784 -; WAVE64-NEXT: .cfi_undefined 2785 -; WAVE64-NEXT: .cfi_undefined 2786 -; WAVE64-NEXT: .cfi_undefined 2787 -; WAVE64-NEXT: .cfi_undefined 2788 -; WAVE64-NEXT: .cfi_undefined 2789 -; WAVE64-NEXT: .cfi_undefined 2790 -; WAVE64-NEXT: .cfi_undefined 2791 - -; WAVE64-NEXT: .cfi_undefined 2800 -; WAVE64-NEXT: .cfi_undefined 2801 -; WAVE64-NEXT: .cfi_undefined 2802 -; 
WAVE64-NEXT: .cfi_undefined 2803 -; WAVE64-NEXT: .cfi_undefined 2804 -; WAVE64-NEXT: .cfi_undefined 2805 -; WAVE64-NEXT: .cfi_undefined 2806 -; WAVE64-NEXT: .cfi_undefined 2807 - -; AGPR0_wave64 = 3072 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3072 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3073 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3074 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3075 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3076 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3077 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3078 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3079 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3080 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3081 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3082 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3083 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3084 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3085 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3086 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3087 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3088 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3089 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3090 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3091 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3092 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3093 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3094 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3095 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3096 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3097 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3098 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3099 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3100 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3101 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3102 -; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3103 - -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3072 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3073 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3074 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3075 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3076 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3077 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3078 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3079 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3080 
-; GFX90A-V2A-EN-NEXT: .cfi_undefined 3081 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3082 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3083 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3084 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3085 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3086 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3087 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3088 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3089 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3090 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3091 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3092 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3093 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3094 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3095 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3096 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3097 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3098 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3099 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3100 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3101 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3102 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3103 - -; VGPR0_wave32 = 1536 -; WAVE32-NEXT: .cfi_undefined 1536 -; WAVE32-NEXT: .cfi_undefined 1537 -; WAVE32-NEXT: .cfi_undefined 1538 -; WAVE32-NEXT: .cfi_undefined 1539 -; WAVE32-NEXT: .cfi_undefined 1540 -; WAVE32-NEXT: .cfi_undefined 1541 -; WAVE32-NEXT: .cfi_undefined 1542 -; WAVE32-NEXT: .cfi_undefined 1543 -; WAVE32-NEXT: .cfi_undefined 1544 -; WAVE32-NEXT: .cfi_undefined 1545 -; WAVE32-NEXT: .cfi_undefined 1546 -; WAVE32-NEXT: .cfi_undefined 1547 -; WAVE32-NEXT: .cfi_undefined 1548 -; WAVE32-NEXT: .cfi_undefined 1549 -; WAVE32-NEXT: .cfi_undefined 1550 -; WAVE32-NEXT: .cfi_undefined 1551 -; WAVE32-NEXT: .cfi_undefined 1552 -; WAVE32-NEXT: .cfi_undefined 1553 -; WAVE32-NEXT: .cfi_undefined 1554 -; WAVE32-NEXT: .cfi_undefined 1555 -; WAVE32-NEXT: .cfi_undefined 1556 -; WAVE32-NEXT: .cfi_undefined 1557 -; WAVE32-NEXT: .cfi_undefined 1558 -; WAVE32-NEXT: .cfi_undefined 1559 -; WAVE32-NEXT: .cfi_undefined 1560 -; WAVE32-NEXT: .cfi_undefined 1561 -; WAVE32-NEXT: .cfi_undefined 1562 -; 
WAVE32-NEXT: .cfi_undefined 1563 -; WAVE32-NEXT: .cfi_undefined 1564 -; WAVE32-NEXT: .cfi_undefined 1565 -; WAVE32-NEXT: .cfi_undefined 1566 -; WAVE32-NEXT: .cfi_undefined 1567 -; WAVE32-NEXT: .cfi_undefined 1568 -; WAVE32-NEXT: .cfi_undefined 1569 -; WAVE32-NEXT: .cfi_undefined 1570 -; WAVE32-NEXT: .cfi_undefined 1571 -; WAVE32-NEXT: .cfi_undefined 1572 -; WAVE32-NEXT: .cfi_undefined 1573 -; WAVE32-NEXT: .cfi_undefined 1574 -; WAVE32-NEXT: .cfi_undefined 1575 - -; VPGR48_wave64 = 1584 -; WAVE32-NEXT: .cfi_undefined 1584 -; WAVE32-NEXT: .cfi_undefined 1585 -; WAVE32-NEXT: .cfi_undefined 1586 -; WAVE32-NEXT: .cfi_undefined 1587 -; WAVE32-NEXT: .cfi_undefined 1588 -; WAVE32-NEXT: .cfi_undefined 1589 -; WAVE32-NEXT: .cfi_undefined 1590 -; WAVE32-NEXT: .cfi_undefined 1591 - -; WAVE32-NEXT: .cfi_undefined 1600 -; WAVE32-NEXT: .cfi_undefined 1601 -; WAVE32-NEXT: .cfi_undefined 1602 -; WAVE32-NEXT: .cfi_undefined 1603 -; WAVE32-NEXT: .cfi_undefined 1604 -; WAVE32-NEXT: .cfi_undefined 1605 -; WAVE32-NEXT: .cfi_undefined 1606 -; WAVE32-NEXT: .cfi_undefined 1607 - -; WAVE32-NEXT: .cfi_undefined 1616 -; WAVE32-NEXT: .cfi_undefined 1617 -; WAVE32-NEXT: .cfi_undefined 1618 -; WAVE32-NEXT: .cfi_undefined 1619 -; WAVE32-NEXT: .cfi_undefined 1620 -; WAVE32-NEXT: .cfi_undefined 1621 -; WAVE32-NEXT: .cfi_undefined 1622 -; WAVE32-NEXT: .cfi_undefined 1623 - -; WAVE32-NEXT: .cfi_undefined 1632 -; WAVE32-NEXT: .cfi_undefined 1633 -; WAVE32-NEXT: .cfi_undefined 1634 -; WAVE32-NEXT: .cfi_undefined 1635 -; WAVE32-NEXT: .cfi_undefined 1636 -; WAVE32-NEXT: .cfi_undefined 1637 -; WAVE32-NEXT: .cfi_undefined 1638 -; WAVE32-NEXT: .cfi_undefined 1639 - -; WAVE32-NEXT: .cfi_undefined 1648 -; WAVE32-NEXT: .cfi_undefined 1649 -; WAVE32-NEXT: .cfi_undefined 1650 -; WAVE32-NEXT: .cfi_undefined 1651 -; WAVE32-NEXT: .cfi_undefined 1652 -; WAVE32-NEXT: .cfi_undefined 1653 -; WAVE32-NEXT: .cfi_undefined 1654 -; WAVE32-NEXT: .cfi_undefined 1655 - -; WAVE32-NEXT: .cfi_undefined 1664 -; WAVE32-NEXT: 
.cfi_undefined 1665 -; WAVE32-NEXT: .cfi_undefined 1666 -; WAVE32-NEXT: .cfi_undefined 1667 -; WAVE32-NEXT: .cfi_undefined 1668 -; WAVE32-NEXT: .cfi_undefined 1669 -; WAVE32-NEXT: .cfi_undefined 1670 -; WAVE32-NEXT: .cfi_undefined 1671 - -; WAVE32-NEXT: .cfi_undefined 1680 -; WAVE32-NEXT: .cfi_undefined 1681 -; WAVE32-NEXT: .cfi_undefined 1682 -; WAVE32-NEXT: .cfi_undefined 1683 -; WAVE32-NEXT: .cfi_undefined 1684 -; WAVE32-NEXT: .cfi_undefined 1685 -; WAVE32-NEXT: .cfi_undefined 1686 -; WAVE32-NEXT: .cfi_undefined 1687 - -; WAVE32-NEXT: .cfi_undefined 1696 -; WAVE32-NEXT: .cfi_undefined 1697 -; WAVE32-NEXT: .cfi_undefined 1698 -; WAVE32-NEXT: .cfi_undefined 1699 -; WAVE32-NEXT: .cfi_undefined 1700 -; WAVE32-NEXT: .cfi_undefined 1701 -; WAVE32-NEXT: .cfi_undefined 1702 -; WAVE32-NEXT: .cfi_undefined 1703 - -; WAVE32-NEXT: .cfi_undefined 1712 -; WAVE32-NEXT: .cfi_undefined 1713 -; WAVE32-NEXT: .cfi_undefined 1714 -; WAVE32-NEXT: .cfi_undefined 1715 -; WAVE32-NEXT: .cfi_undefined 1716 -; WAVE32-NEXT: .cfi_undefined 1717 -; WAVE32-NEXT: .cfi_undefined 1718 -; WAVE32-NEXT: .cfi_undefined 1719 - -; WAVE32-NEXT: .cfi_undefined 1728 -; WAVE32-NEXT: .cfi_undefined 1729 -; WAVE32-NEXT: .cfi_undefined 1730 -; WAVE32-NEXT: .cfi_undefined 1731 -; WAVE32-NEXT: .cfi_undefined 1732 -; WAVE32-NEXT: .cfi_undefined 1733 -; WAVE32-NEXT: .cfi_undefined 1734 -; WAVE32-NEXT: .cfi_undefined 1735 - -; WAVE32-NEXT: .cfi_undefined 1744 -; WAVE32-NEXT: .cfi_undefined 1745 -; WAVE32-NEXT: .cfi_undefined 1746 -; WAVE32-NEXT: .cfi_undefined 1747 -; WAVE32-NEXT: .cfi_undefined 1748 -; WAVE32-NEXT: .cfi_undefined 1749 -; WAVE32-NEXT: .cfi_undefined 1750 -; WAVE32-NEXT: .cfi_undefined 1751 - -; WAVE32-NEXT: .cfi_undefined 1760 -; WAVE32-NEXT: .cfi_undefined 1761 -; WAVE32-NEXT: .cfi_undefined 1762 -; WAVE32-NEXT: .cfi_undefined 1763 -; WAVE32-NEXT: .cfi_undefined 1764 -; WAVE32-NEXT: .cfi_undefined 1765 -; WAVE32-NEXT: .cfi_undefined 1766 -; WAVE32-NEXT: .cfi_undefined 1767 - -; WAVE32-NEXT: 
.cfi_undefined 1776 -; WAVE32-NEXT: .cfi_undefined 1777 -; WAVE32-NEXT: .cfi_undefined 1778 -; WAVE32-NEXT: .cfi_undefined 1779 -; WAVE32-NEXT: .cfi_undefined 1780 -; WAVE32-NEXT: .cfi_undefined 1781 -; WAVE32-NEXT: .cfi_undefined 1782 -; WAVE32-NEXT: .cfi_undefined 1783 - - -; SGPR0 = 32 -; CHECK-NEXT: .cfi_undefined 32 -; CHECK-NEXT: .cfi_undefined 33 -; CHECK-NEXT: .cfi_undefined 34 -; CHECK-NEXT: .cfi_undefined 35 -; CHECK-NEXT: .cfi_undefined 36 -; CHECK-NEXT: .cfi_undefined 37 -; CHECK-NEXT: .cfi_undefined 38 -; CHECK-NEXT: .cfi_undefined 39 -; CHECK-NEXT: .cfi_undefined 40 -; CHECK-NEXT: .cfi_undefined 41 -; CHECK-NEXT: .cfi_undefined 42 -; CHECK-NEXT: .cfi_undefined 43 -; CHECK-NEXT: .cfi_undefined 44 -; CHECK-NEXT: .cfi_undefined 45 -; CHECK-NEXT: .cfi_undefined 46 -; CHECK-NEXT: .cfi_undefined 47 -; CHECK-NEXT: .cfi_undefined 48 -; CHECK-NEXT: .cfi_undefined 49 -; CHECK-NEXT: .cfi_undefined 50 -; CHECK-NEXT: .cfi_undefined 51 -; CHECK-NEXT: .cfi_undefined 52 -; CHECK-NEXT: .cfi_undefined 53 -; CHECK-NEXT: .cfi_undefined 54 -; CHECK-NEXT: .cfi_undefined 55 -; CHECK-NEXT: .cfi_undefined 56 -; CHECK-NEXT: .cfi_undefined 57 -; CHECK-NEXT: .cfi_undefined 58 -; CHECK-NEXT: .cfi_undefined 59 -; CHECK-NEXT: .cfi_undefined 60 -; CHECK-NEXT: .cfi_undefined 61 - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; CHECK: s_mov_b32 s33, s32 -; WAVE64: s_or_saveexec_b64 [[EXEC_MASK:s\[[0-9]+:[0-9]+\]]], -1 -; WAVE32: s_or_saveexec_b32 [[EXEC_MASK:s[0-9]+]], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; VGPR40_wave64 = 2600 -; WAVE64-NEXT: .cfi_offset 2600, 0 -; VGPR40_wave32 = 1576 -; WAVE32-NEXT: .cfi_offset 1576, 0 -; WAVE64: s_mov_b64 exec, [[EXEC_MASK]] -; WAVE32: s_mov_b32 exec_lo, [[EXEC_MASK]] - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2 -; WAVE64-NEXT: .cfi_llvm_vector_registers 65, 2600, 2, 32 -; WAVE32-NEXT: .cfi_llvm_vector_registers 65, 1576, 2, 32 - 
-; CHECK-NOT: .cfi_{{.*}} - -; SGPR33 = 65 -; CHECK-NEXT: .cfi_def_cfa_register 65 - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: s_addk_i32 s32, -; CHECK: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 -; SGPR32 = 64 -; CHECK: .cfi_def_cfa_register 64 -; CHECK-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: .cfi_endproc define hidden void @func_call_clobber() #0 { +; GFX900-LABEL: func_call_clobber: +; GFX900: .Lfunc_begin3: +; GFX900-NEXT: .cfi_startproc +; GFX900-NEXT: ; %bb.0: ; %entry +; GFX900-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GFX900-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; GFX900-NEXT: .cfi_undefined 2560 +; GFX900-NEXT: .cfi_undefined 2561 +; GFX900-NEXT: .cfi_undefined 2562 +; GFX900-NEXT: .cfi_undefined 2563 +; GFX900-NEXT: .cfi_undefined 2564 +; GFX900-NEXT: .cfi_undefined 2565 +; GFX900-NEXT: .cfi_undefined 2566 +; GFX900-NEXT: .cfi_undefined 2567 +; GFX900-NEXT: .cfi_undefined 2568 +; GFX900-NEXT: .cfi_undefined 2569 +; GFX900-NEXT: .cfi_undefined 2570 +; GFX900-NEXT: .cfi_undefined 2571 +; GFX900-NEXT: .cfi_undefined 2572 +; GFX900-NEXT: .cfi_undefined 2573 +; GFX900-NEXT: .cfi_undefined 2574 +; GFX900-NEXT: .cfi_undefined 2575 +; GFX900-NEXT: .cfi_undefined 2576 +; GFX900-NEXT: .cfi_undefined 2577 +; GFX900-NEXT: .cfi_undefined 2578 +; GFX900-NEXT: .cfi_undefined 2579 +; GFX900-NEXT: .cfi_undefined 2580 +; GFX900-NEXT: .cfi_undefined 2581 +; GFX900-NEXT: .cfi_undefined 2582 +; GFX900-NEXT: .cfi_undefined 2583 +; GFX900-NEXT: .cfi_undefined 2584 +; GFX900-NEXT: .cfi_undefined 2585 +; GFX900-NEXT: .cfi_undefined 2586 +; GFX900-NEXT: .cfi_undefined 2587 +; GFX900-NEXT: .cfi_undefined 2588 +; GFX900-NEXT: .cfi_undefined 2589 +; GFX900-NEXT: .cfi_undefined 2590 +; GFX900-NEXT: .cfi_undefined 2591 +; GFX900-NEXT: .cfi_undefined 2592 +; GFX900-NEXT: .cfi_undefined 2593 +; GFX900-NEXT: .cfi_undefined 2594 +; GFX900-NEXT: .cfi_undefined 2595 +; GFX900-NEXT: .cfi_undefined 2596 +; GFX900-NEXT: .cfi_undefined 
2597 +; GFX900-NEXT: .cfi_undefined 2598 +; GFX900-NEXT: .cfi_undefined 2599 +; GFX900-NEXT: .cfi_undefined 2608 +; GFX900-NEXT: .cfi_undefined 2609 +; GFX900-NEXT: .cfi_undefined 2610 +; GFX900-NEXT: .cfi_undefined 2611 +; GFX900-NEXT: .cfi_undefined 2612 +; GFX900-NEXT: .cfi_undefined 2613 +; GFX900-NEXT: .cfi_undefined 2614 +; GFX900-NEXT: .cfi_undefined 2615 +; GFX900-NEXT: .cfi_undefined 2624 +; GFX900-NEXT: .cfi_undefined 2625 +; GFX900-NEXT: .cfi_undefined 2626 +; GFX900-NEXT: .cfi_undefined 2627 +; GFX900-NEXT: .cfi_undefined 2628 +; GFX900-NEXT: .cfi_undefined 2629 +; GFX900-NEXT: .cfi_undefined 2630 +; GFX900-NEXT: .cfi_undefined 2631 +; GFX900-NEXT: .cfi_undefined 2640 +; GFX900-NEXT: .cfi_undefined 2641 +; GFX900-NEXT: .cfi_undefined 2642 +; GFX900-NEXT: .cfi_undefined 2643 +; GFX900-NEXT: .cfi_undefined 2644 +; GFX900-NEXT: .cfi_undefined 2645 +; GFX900-NEXT: .cfi_undefined 2646 +; GFX900-NEXT: .cfi_undefined 2647 +; GFX900-NEXT: .cfi_undefined 2656 +; GFX900-NEXT: .cfi_undefined 2657 +; GFX900-NEXT: .cfi_undefined 2658 +; GFX900-NEXT: .cfi_undefined 2659 +; GFX900-NEXT: .cfi_undefined 2660 +; GFX900-NEXT: .cfi_undefined 2661 +; GFX900-NEXT: .cfi_undefined 2662 +; GFX900-NEXT: .cfi_undefined 2663 +; GFX900-NEXT: .cfi_undefined 2672 +; GFX900-NEXT: .cfi_undefined 2673 +; GFX900-NEXT: .cfi_undefined 2674 +; GFX900-NEXT: .cfi_undefined 2675 +; GFX900-NEXT: .cfi_undefined 2676 +; GFX900-NEXT: .cfi_undefined 2677 +; GFX900-NEXT: .cfi_undefined 2678 +; GFX900-NEXT: .cfi_undefined 2679 +; GFX900-NEXT: .cfi_undefined 2688 +; GFX900-NEXT: .cfi_undefined 2689 +; GFX900-NEXT: .cfi_undefined 2690 +; GFX900-NEXT: .cfi_undefined 2691 +; GFX900-NEXT: .cfi_undefined 2692 +; GFX900-NEXT: .cfi_undefined 2693 +; GFX900-NEXT: .cfi_undefined 2694 +; GFX900-NEXT: .cfi_undefined 2695 +; GFX900-NEXT: .cfi_undefined 2704 +; GFX900-NEXT: .cfi_undefined 2705 +; GFX900-NEXT: .cfi_undefined 2706 +; GFX900-NEXT: .cfi_undefined 2707 +; GFX900-NEXT: .cfi_undefined 2708 +; 
GFX900-NEXT: .cfi_undefined 2709 +; GFX900-NEXT: .cfi_undefined 2710 +; GFX900-NEXT: .cfi_undefined 2711 +; GFX900-NEXT: .cfi_undefined 2720 +; GFX900-NEXT: .cfi_undefined 2721 +; GFX900-NEXT: .cfi_undefined 2722 +; GFX900-NEXT: .cfi_undefined 2723 +; GFX900-NEXT: .cfi_undefined 2724 +; GFX900-NEXT: .cfi_undefined 2725 +; GFX900-NEXT: .cfi_undefined 2726 +; GFX900-NEXT: .cfi_undefined 2727 +; GFX900-NEXT: .cfi_undefined 2736 +; GFX900-NEXT: .cfi_undefined 2737 +; GFX900-NEXT: .cfi_undefined 2738 +; GFX900-NEXT: .cfi_undefined 2739 +; GFX900-NEXT: .cfi_undefined 2740 +; GFX900-NEXT: .cfi_undefined 2741 +; GFX900-NEXT: .cfi_undefined 2742 +; GFX900-NEXT: .cfi_undefined 2743 +; GFX900-NEXT: .cfi_undefined 2752 +; GFX900-NEXT: .cfi_undefined 2753 +; GFX900-NEXT: .cfi_undefined 2754 +; GFX900-NEXT: .cfi_undefined 2755 +; GFX900-NEXT: .cfi_undefined 2756 +; GFX900-NEXT: .cfi_undefined 2757 +; GFX900-NEXT: .cfi_undefined 2758 +; GFX900-NEXT: .cfi_undefined 2759 +; GFX900-NEXT: .cfi_undefined 2768 +; GFX900-NEXT: .cfi_undefined 2769 +; GFX900-NEXT: .cfi_undefined 2770 +; GFX900-NEXT: .cfi_undefined 2771 +; GFX900-NEXT: .cfi_undefined 2772 +; GFX900-NEXT: .cfi_undefined 2773 +; GFX900-NEXT: .cfi_undefined 2774 +; GFX900-NEXT: .cfi_undefined 2775 +; GFX900-NEXT: .cfi_undefined 2784 +; GFX900-NEXT: .cfi_undefined 2785 +; GFX900-NEXT: .cfi_undefined 2786 +; GFX900-NEXT: .cfi_undefined 2787 +; GFX900-NEXT: .cfi_undefined 2788 +; GFX900-NEXT: .cfi_undefined 2789 +; GFX900-NEXT: .cfi_undefined 2790 +; GFX900-NEXT: .cfi_undefined 2791 +; GFX900-NEXT: .cfi_undefined 2800 +; GFX900-NEXT: .cfi_undefined 2801 +; GFX900-NEXT: .cfi_undefined 2802 +; GFX900-NEXT: .cfi_undefined 2803 +; GFX900-NEXT: .cfi_undefined 2804 +; GFX900-NEXT: .cfi_undefined 2805 +; GFX900-NEXT: .cfi_undefined 2806 +; GFX900-NEXT: .cfi_undefined 2807 +; GFX900-NEXT: .cfi_undefined 32 +; GFX900-NEXT: .cfi_undefined 33 +; GFX900-NEXT: .cfi_undefined 34 +; GFX900-NEXT: .cfi_undefined 35 +; GFX900-NEXT: .cfi_undefined 
36 +; GFX900-NEXT: .cfi_undefined 37 +; GFX900-NEXT: .cfi_undefined 38 +; GFX900-NEXT: .cfi_undefined 39 +; GFX900-NEXT: .cfi_undefined 40 +; GFX900-NEXT: .cfi_undefined 41 +; GFX900-NEXT: .cfi_undefined 42 +; GFX900-NEXT: .cfi_undefined 43 +; GFX900-NEXT: .cfi_undefined 44 +; GFX900-NEXT: .cfi_undefined 45 +; GFX900-NEXT: .cfi_undefined 46 +; GFX900-NEXT: .cfi_undefined 47 +; GFX900-NEXT: .cfi_undefined 48 +; GFX900-NEXT: .cfi_undefined 49 +; GFX900-NEXT: .cfi_undefined 50 +; GFX900-NEXT: .cfi_undefined 51 +; GFX900-NEXT: .cfi_undefined 52 +; GFX900-NEXT: .cfi_undefined 53 +; GFX900-NEXT: .cfi_undefined 54 +; GFX900-NEXT: .cfi_undefined 55 +; GFX900-NEXT: .cfi_undefined 56 +; GFX900-NEXT: .cfi_undefined 57 +; GFX900-NEXT: .cfi_undefined 58 +; GFX900-NEXT: .cfi_undefined 59 +; GFX900-NEXT: .cfi_undefined 60 +; GFX900-NEXT: .cfi_undefined 61 +; GFX900-NEXT: .cfi_undefined 72 +; GFX900-NEXT: .cfi_undefined 73 +; GFX900-NEXT: .cfi_undefined 74 +; GFX900-NEXT: .cfi_undefined 75 +; GFX900-NEXT: .cfi_undefined 76 +; GFX900-NEXT: .cfi_undefined 77 +; GFX900-NEXT: .cfi_undefined 78 +; GFX900-NEXT: .cfi_undefined 79 +; GFX900-NEXT: .cfi_undefined 88 +; GFX900-NEXT: .cfi_undefined 89 +; GFX900-NEXT: .cfi_undefined 90 +; GFX900-NEXT: .cfi_undefined 91 +; GFX900-NEXT: .cfi_undefined 92 +; GFX900-NEXT: .cfi_undefined 93 +; GFX900-NEXT: .cfi_undefined 94 +; GFX900-NEXT: .cfi_undefined 95 +; GFX900-NEXT: .cfi_undefined 1096 +; GFX900-NEXT: .cfi_undefined 1097 +; GFX900-NEXT: .cfi_undefined 1098 +; GFX900-NEXT: .cfi_undefined 1099 +; GFX900-NEXT: .cfi_undefined 1100 +; GFX900-NEXT: .cfi_undefined 1101 +; GFX900-NEXT: .cfi_undefined 1102 +; GFX900-NEXT: .cfi_undefined 1103 +; GFX900-NEXT: .cfi_undefined 1112 +; GFX900-NEXT: .cfi_undefined 1113 +; GFX900-NEXT: .cfi_undefined 1114 +; GFX900-NEXT: .cfi_undefined 1115 +; GFX900-NEXT: .cfi_undefined 1116 +; GFX900-NEXT: .cfi_undefined 1117 +; GFX900-NEXT: .cfi_undefined 1118 +; GFX900-NEXT: .cfi_undefined 1119 +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s16, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_offset 2600, 0 +; GFX900-NEXT: s_mov_b64 exec, s[18:19] +; GFX900-NEXT: v_writelane_b32 v40, s16, 2 +; GFX900-NEXT: .cfi_llvm_vector_registers 65, 2600, 2, 32 +; GFX900-NEXT: .cfi_def_cfa_register 65 +; GFX900-NEXT: v_writelane_b32 v40, s30, 0 +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: v_writelane_b32 v40, s31, 1 +; GFX900-NEXT: .cfi_llvm_vector_registers 16, 2600, 0, 32, 2600, 1, 32 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: v_readlane_b32 s30, v40, 0 +; GFX900-NEXT: v_readlane_b32 s31, v40, 1 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: v_readlane_b32 s4, v40, 2 +; GFX900-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX900-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[6:7] +; GFX900-NEXT: .cfi_def_cfa_register 64 +; GFX900-NEXT: s_mov_b32 s33, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-V2A-DIS-LABEL: func_call_clobber: +; GFX90A-V2A-DIS: .Lfunc_begin3: +; GFX90A-V2A-DIS-NEXT: .cfi_startproc +; GFX90A-V2A-DIS-NEXT: ; %bb.0: ; %entry +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2560 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2561 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2562 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2563 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2564 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2565 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2566 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2567 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2568 +; 
GFX90A-V2A-DIS-NEXT: .cfi_undefined 2569 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2570 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2571 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2572 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2573 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2574 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2575 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2576 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2577 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2578 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2579 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2580 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2581 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2582 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2583 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2584 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2585 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2586 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2587 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2588 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2589 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2590 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2591 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2592 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2593 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2594 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2595 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2596 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2597 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2598 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2599 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2608 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2609 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2610 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2611 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2612 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2613 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2614 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2615 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2624 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2625 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2626 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2627 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2628 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2629 +; 
GFX90A-V2A-DIS-NEXT: .cfi_undefined 2630 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2631 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2640 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2641 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2642 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2643 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2644 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2645 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2646 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2647 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2656 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2657 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2658 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2659 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2660 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2661 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2662 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2663 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2672 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2673 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2674 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2675 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2676 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2677 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2678 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2679 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2688 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2689 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2690 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2691 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2692 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2693 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2694 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2695 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2704 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2705 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2706 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2707 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2708 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2709 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2710 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2711 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2720 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2721 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2722 +; 
GFX90A-V2A-DIS-NEXT: .cfi_undefined 2723 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2724 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2725 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2726 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2727 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2736 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2737 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2738 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2739 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2740 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2741 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2742 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2743 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2752 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2753 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2754 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2755 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2756 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2757 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2758 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2759 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2768 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2769 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2770 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2771 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2772 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2773 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2774 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2775 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2784 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2785 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2786 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2787 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2788 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2789 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2790 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2791 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2800 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2801 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2802 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2803 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2804 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2805 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2806 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 2807 +; 
GFX90A-V2A-DIS-NEXT: .cfi_undefined 3072 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3073 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3074 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3075 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3076 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3077 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3078 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3079 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3080 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3081 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3082 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3083 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3084 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3085 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3086 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3087 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3088 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3089 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3090 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3091 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3092 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3093 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3094 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3095 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3096 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3097 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3098 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3099 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3100 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3101 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3102 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 3103 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 32 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 33 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 34 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 35 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 36 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 37 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 38 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 39 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 40 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 41 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 42 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 43 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 44 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 45 +; 
GFX90A-V2A-DIS-NEXT: .cfi_undefined 46 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 47 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 48 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 49 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 50 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 51 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 52 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 53 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 54 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 55 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 56 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 57 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 58 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 59 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 60 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 61 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 72 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 73 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 74 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 75 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 76 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 77 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 78 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 79 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 88 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 89 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 90 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 91 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 92 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 93 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 94 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 95 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1096 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1097 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1098 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1099 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1100 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1101 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1102 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1103 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1112 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1113 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1114 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1115 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1116 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1117 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 
1118 +; GFX90A-V2A-DIS-NEXT: .cfi_undefined 1119 +; GFX90A-V2A-DIS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-V2A-DIS-NEXT: s_mov_b32 s16, s33 +; GFX90A-V2A-DIS-NEXT: s_mov_b32 s33, s32 +; GFX90A-V2A-DIS-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_offset 2600, 0 +; GFX90A-V2A-DIS-NEXT: s_mov_b64 exec, s[18:19] +; GFX90A-V2A-DIS-NEXT: v_writelane_b32 v40, s16, 2 +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_registers 65, 2600, 2, 32 +; GFX90A-V2A-DIS-NEXT: .cfi_def_cfa_register 65 +; GFX90A-V2A-DIS-NEXT: v_writelane_b32 v40, s30, 0 +; GFX90A-V2A-DIS-NEXT: s_addk_i32 s32, 0x400 +; GFX90A-V2A-DIS-NEXT: v_writelane_b32 v40, s31, 1 +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_registers 16, 2600, 0, 32, 2600, 1, 32 +; GFX90A-V2A-DIS-NEXT: s_getpc_b64 s[16:17] +; GFX90A-V2A-DIS-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX90A-V2A-DIS-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 +; GFX90A-V2A-DIS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s30, v40, 0 +; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s31, v40, 1 +; GFX90A-V2A-DIS-NEXT: s_mov_b32 s32, s33 +; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s4, v40, 2 +; GFX90A-V2A-DIS-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-V2A-DIS-NEXT: .cfi_def_cfa_register 64 +; GFX90A-V2A-DIS-NEXT: s_mov_b32 s33, s4 +; GFX90A-V2A-DIS-NEXT: s_waitcnt vmcnt(0) +; GFX90A-V2A-DIS-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-V2A-EN-LABEL: func_call_clobber: +; GFX90A-V2A-EN: .Lfunc_begin3: +; GFX90A-V2A-EN-NEXT: .cfi_startproc +; GFX90A-V2A-EN-NEXT: ; %bb.0: ; %entry +; GFX90A-V2A-EN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GFX90A-V2A-EN-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2560 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2561 +; 
GFX90A-V2A-EN-NEXT: .cfi_undefined 2562 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2563 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2564 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2565 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2566 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2567 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2568 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2569 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2570 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2571 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2572 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2573 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2574 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2575 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2576 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2577 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2578 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2579 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2580 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2581 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2582 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2583 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2584 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2585 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2586 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2587 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2588 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2589 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2590 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2591 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2592 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2593 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2594 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2595 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2596 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2597 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2598 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2599 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2608 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2609 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2610 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2611 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2612 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2613 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2614 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2615 +; GFX90A-V2A-EN-NEXT: 
.cfi_undefined 2624 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2625 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2626 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2627 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2628 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2629 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2630 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2631 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2640 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2641 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2642 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2643 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2644 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2645 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2646 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2647 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2656 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2657 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2658 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2659 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2660 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2661 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2662 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2663 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2672 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2673 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2674 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2675 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2676 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2677 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2678 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2679 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2688 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2689 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2690 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2691 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2692 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2693 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2694 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2695 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2704 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2705 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2706 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2707 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2708 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2709 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2710 
+; GFX90A-V2A-EN-NEXT: .cfi_undefined 2711 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2720 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2721 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2722 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2723 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2724 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2725 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2726 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2727 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2736 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2737 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2738 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2739 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2740 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2741 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2742 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2743 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2752 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2753 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2754 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2755 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2756 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2757 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2758 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2759 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2768 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2769 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2770 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2771 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2772 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2773 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2774 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2775 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2784 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2785 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2786 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2787 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2788 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2789 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2790 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2791 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2800 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2801 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2802 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2803 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2804 +; 
GFX90A-V2A-EN-NEXT: .cfi_undefined 2805 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2806 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2807 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3072 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3073 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3074 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3075 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3076 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3077 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3078 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3079 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3080 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3081 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3082 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3083 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3084 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3085 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3086 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3087 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3088 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3089 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3090 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3091 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3092 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3093 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3094 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3095 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3096 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3097 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3098 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3099 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3100 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3101 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3102 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3103 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 32 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 33 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 34 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 35 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 36 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 37 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 38 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 39 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 40 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 41 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 42 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 43 +; 
GFX90A-V2A-EN-NEXT: .cfi_undefined 44 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 45 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 46 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 47 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 48 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 49 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 50 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 51 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 52 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 53 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 54 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 55 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 56 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 57 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 58 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 59 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 60 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 61 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 72 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 73 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 74 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 75 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 76 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 77 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 78 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 79 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 88 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 89 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 90 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 91 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 92 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 93 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 94 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 95 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1096 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1097 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1098 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1099 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1100 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1101 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1102 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1103 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1112 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1113 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1114 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1115 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1116 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1117 +; 
GFX90A-V2A-EN-NEXT: .cfi_undefined 1118 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 1119 +; GFX90A-V2A-EN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-V2A-EN-NEXT: s_mov_b32 s16, s33 +; GFX90A-V2A-EN-NEXT: s_mov_b32 s33, s32 +; GFX90A-V2A-EN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX90A-V2A-EN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX90A-V2A-EN-NEXT: .cfi_offset 2600, 0 +; GFX90A-V2A-EN-NEXT: s_mov_b64 exec, s[18:19] +; GFX90A-V2A-EN-NEXT: v_writelane_b32 v40, s16, 2 +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_registers 65, 2600, 2, 32 +; GFX90A-V2A-EN-NEXT: .cfi_def_cfa_register 65 +; GFX90A-V2A-EN-NEXT: v_writelane_b32 v40, s30, 0 +; GFX90A-V2A-EN-NEXT: s_addk_i32 s32, 0x400 +; GFX90A-V2A-EN-NEXT: v_writelane_b32 v40, s31, 1 +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_registers 16, 2600, 0, 32, 2600, 1, 32 +; GFX90A-V2A-EN-NEXT: s_getpc_b64 s[16:17] +; GFX90A-V2A-EN-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX90A-V2A-EN-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 +; GFX90A-V2A-EN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX90A-V2A-EN-NEXT: v_readlane_b32 s30, v40, 0 +; GFX90A-V2A-EN-NEXT: v_readlane_b32 s31, v40, 1 +; GFX90A-V2A-EN-NEXT: s_mov_b32 s32, s33 +; GFX90A-V2A-EN-NEXT: v_readlane_b32 s4, v40, 2 +; GFX90A-V2A-EN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX90A-V2A-EN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX90A-V2A-EN-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-V2A-EN-NEXT: .cfi_def_cfa_register 64 +; GFX90A-V2A-EN-NEXT: s_mov_b32 s33, s4 +; GFX90A-V2A-EN-NEXT: s_waitcnt vmcnt(0) +; GFX90A-V2A-EN-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: func_call_clobber: +; WAVE32: .Lfunc_begin3: +; WAVE32-NEXT: .cfi_startproc +; WAVE32-NEXT: ; %bb.0: ; %entry +; WAVE32-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE32-NEXT: .cfi_undefined 1536 +; WAVE32-NEXT: .cfi_undefined 1537 +; WAVE32-NEXT: .cfi_undefined 1538 +; WAVE32-NEXT: .cfi_undefined 1539 
+; WAVE32-NEXT: .cfi_undefined 1540 +; WAVE32-NEXT: .cfi_undefined 1541 +; WAVE32-NEXT: .cfi_undefined 1542 +; WAVE32-NEXT: .cfi_undefined 1543 +; WAVE32-NEXT: .cfi_undefined 1544 +; WAVE32-NEXT: .cfi_undefined 1545 +; WAVE32-NEXT: .cfi_undefined 1546 +; WAVE32-NEXT: .cfi_undefined 1547 +; WAVE32-NEXT: .cfi_undefined 1548 +; WAVE32-NEXT: .cfi_undefined 1549 +; WAVE32-NEXT: .cfi_undefined 1550 +; WAVE32-NEXT: .cfi_undefined 1551 +; WAVE32-NEXT: .cfi_undefined 1552 +; WAVE32-NEXT: .cfi_undefined 1553 +; WAVE32-NEXT: .cfi_undefined 1554 +; WAVE32-NEXT: .cfi_undefined 1555 +; WAVE32-NEXT: .cfi_undefined 1556 +; WAVE32-NEXT: .cfi_undefined 1557 +; WAVE32-NEXT: .cfi_undefined 1558 +; WAVE32-NEXT: .cfi_undefined 1559 +; WAVE32-NEXT: .cfi_undefined 1560 +; WAVE32-NEXT: .cfi_undefined 1561 +; WAVE32-NEXT: .cfi_undefined 1562 +; WAVE32-NEXT: .cfi_undefined 1563 +; WAVE32-NEXT: .cfi_undefined 1564 +; WAVE32-NEXT: .cfi_undefined 1565 +; WAVE32-NEXT: .cfi_undefined 1566 +; WAVE32-NEXT: .cfi_undefined 1567 +; WAVE32-NEXT: .cfi_undefined 1568 +; WAVE32-NEXT: .cfi_undefined 1569 +; WAVE32-NEXT: .cfi_undefined 1570 +; WAVE32-NEXT: .cfi_undefined 1571 +; WAVE32-NEXT: .cfi_undefined 1572 +; WAVE32-NEXT: .cfi_undefined 1573 +; WAVE32-NEXT: .cfi_undefined 1574 +; WAVE32-NEXT: .cfi_undefined 1575 +; WAVE32-NEXT: .cfi_undefined 1584 +; WAVE32-NEXT: .cfi_undefined 1585 +; WAVE32-NEXT: .cfi_undefined 1586 +; WAVE32-NEXT: .cfi_undefined 1587 +; WAVE32-NEXT: .cfi_undefined 1588 +; WAVE32-NEXT: .cfi_undefined 1589 +; WAVE32-NEXT: .cfi_undefined 1590 +; WAVE32-NEXT: .cfi_undefined 1591 +; WAVE32-NEXT: .cfi_undefined 1600 +; WAVE32-NEXT: .cfi_undefined 1601 +; WAVE32-NEXT: .cfi_undefined 1602 +; WAVE32-NEXT: .cfi_undefined 1603 +; WAVE32-NEXT: .cfi_undefined 1604 +; WAVE32-NEXT: .cfi_undefined 1605 +; WAVE32-NEXT: .cfi_undefined 1606 +; WAVE32-NEXT: .cfi_undefined 1607 +; WAVE32-NEXT: .cfi_undefined 1616 +; WAVE32-NEXT: .cfi_undefined 1617 +; WAVE32-NEXT: .cfi_undefined 1618 +; WAVE32-NEXT: 
.cfi_undefined 1619 +; WAVE32-NEXT: .cfi_undefined 1620 +; WAVE32-NEXT: .cfi_undefined 1621 +; WAVE32-NEXT: .cfi_undefined 1622 +; WAVE32-NEXT: .cfi_undefined 1623 +; WAVE32-NEXT: .cfi_undefined 1632 +; WAVE32-NEXT: .cfi_undefined 1633 +; WAVE32-NEXT: .cfi_undefined 1634 +; WAVE32-NEXT: .cfi_undefined 1635 +; WAVE32-NEXT: .cfi_undefined 1636 +; WAVE32-NEXT: .cfi_undefined 1637 +; WAVE32-NEXT: .cfi_undefined 1638 +; WAVE32-NEXT: .cfi_undefined 1639 +; WAVE32-NEXT: .cfi_undefined 1648 +; WAVE32-NEXT: .cfi_undefined 1649 +; WAVE32-NEXT: .cfi_undefined 1650 +; WAVE32-NEXT: .cfi_undefined 1651 +; WAVE32-NEXT: .cfi_undefined 1652 +; WAVE32-NEXT: .cfi_undefined 1653 +; WAVE32-NEXT: .cfi_undefined 1654 +; WAVE32-NEXT: .cfi_undefined 1655 +; WAVE32-NEXT: .cfi_undefined 1664 +; WAVE32-NEXT: .cfi_undefined 1665 +; WAVE32-NEXT: .cfi_undefined 1666 +; WAVE32-NEXT: .cfi_undefined 1667 +; WAVE32-NEXT: .cfi_undefined 1668 +; WAVE32-NEXT: .cfi_undefined 1669 +; WAVE32-NEXT: .cfi_undefined 1670 +; WAVE32-NEXT: .cfi_undefined 1671 +; WAVE32-NEXT: .cfi_undefined 1680 +; WAVE32-NEXT: .cfi_undefined 1681 +; WAVE32-NEXT: .cfi_undefined 1682 +; WAVE32-NEXT: .cfi_undefined 1683 +; WAVE32-NEXT: .cfi_undefined 1684 +; WAVE32-NEXT: .cfi_undefined 1685 +; WAVE32-NEXT: .cfi_undefined 1686 +; WAVE32-NEXT: .cfi_undefined 1687 +; WAVE32-NEXT: .cfi_undefined 1696 +; WAVE32-NEXT: .cfi_undefined 1697 +; WAVE32-NEXT: .cfi_undefined 1698 +; WAVE32-NEXT: .cfi_undefined 1699 +; WAVE32-NEXT: .cfi_undefined 1700 +; WAVE32-NEXT: .cfi_undefined 1701 +; WAVE32-NEXT: .cfi_undefined 1702 +; WAVE32-NEXT: .cfi_undefined 1703 +; WAVE32-NEXT: .cfi_undefined 1712 +; WAVE32-NEXT: .cfi_undefined 1713 +; WAVE32-NEXT: .cfi_undefined 1714 +; WAVE32-NEXT: .cfi_undefined 1715 +; WAVE32-NEXT: .cfi_undefined 1716 +; WAVE32-NEXT: .cfi_undefined 1717 +; WAVE32-NEXT: .cfi_undefined 1718 +; WAVE32-NEXT: .cfi_undefined 1719 +; WAVE32-NEXT: .cfi_undefined 1728 +; WAVE32-NEXT: .cfi_undefined 1729 +; WAVE32-NEXT: .cfi_undefined 1730 
+; WAVE32-NEXT: .cfi_undefined 1731 +; WAVE32-NEXT: .cfi_undefined 1732 +; WAVE32-NEXT: .cfi_undefined 1733 +; WAVE32-NEXT: .cfi_undefined 1734 +; WAVE32-NEXT: .cfi_undefined 1735 +; WAVE32-NEXT: .cfi_undefined 1744 +; WAVE32-NEXT: .cfi_undefined 1745 +; WAVE32-NEXT: .cfi_undefined 1746 +; WAVE32-NEXT: .cfi_undefined 1747 +; WAVE32-NEXT: .cfi_undefined 1748 +; WAVE32-NEXT: .cfi_undefined 1749 +; WAVE32-NEXT: .cfi_undefined 1750 +; WAVE32-NEXT: .cfi_undefined 1751 +; WAVE32-NEXT: .cfi_undefined 1760 +; WAVE32-NEXT: .cfi_undefined 1761 +; WAVE32-NEXT: .cfi_undefined 1762 +; WAVE32-NEXT: .cfi_undefined 1763 +; WAVE32-NEXT: .cfi_undefined 1764 +; WAVE32-NEXT: .cfi_undefined 1765 +; WAVE32-NEXT: .cfi_undefined 1766 +; WAVE32-NEXT: .cfi_undefined 1767 +; WAVE32-NEXT: .cfi_undefined 1776 +; WAVE32-NEXT: .cfi_undefined 1777 +; WAVE32-NEXT: .cfi_undefined 1778 +; WAVE32-NEXT: .cfi_undefined 1779 +; WAVE32-NEXT: .cfi_undefined 1780 +; WAVE32-NEXT: .cfi_undefined 1781 +; WAVE32-NEXT: .cfi_undefined 1782 +; WAVE32-NEXT: .cfi_undefined 1783 +; WAVE32-NEXT: .cfi_undefined 32 +; WAVE32-NEXT: .cfi_undefined 33 +; WAVE32-NEXT: .cfi_undefined 34 +; WAVE32-NEXT: .cfi_undefined 35 +; WAVE32-NEXT: .cfi_undefined 36 +; WAVE32-NEXT: .cfi_undefined 37 +; WAVE32-NEXT: .cfi_undefined 38 +; WAVE32-NEXT: .cfi_undefined 39 +; WAVE32-NEXT: .cfi_undefined 40 +; WAVE32-NEXT: .cfi_undefined 41 +; WAVE32-NEXT: .cfi_undefined 42 +; WAVE32-NEXT: .cfi_undefined 43 +; WAVE32-NEXT: .cfi_undefined 44 +; WAVE32-NEXT: .cfi_undefined 45 +; WAVE32-NEXT: .cfi_undefined 46 +; WAVE32-NEXT: .cfi_undefined 47 +; WAVE32-NEXT: .cfi_undefined 48 +; WAVE32-NEXT: .cfi_undefined 49 +; WAVE32-NEXT: .cfi_undefined 50 +; WAVE32-NEXT: .cfi_undefined 51 +; WAVE32-NEXT: .cfi_undefined 52 +; WAVE32-NEXT: .cfi_undefined 53 +; WAVE32-NEXT: .cfi_undefined 54 +; WAVE32-NEXT: .cfi_undefined 55 +; WAVE32-NEXT: .cfi_undefined 56 +; WAVE32-NEXT: .cfi_undefined 57 +; WAVE32-NEXT: .cfi_undefined 58 +; WAVE32-NEXT: .cfi_undefined 59 +; 
WAVE32-NEXT: .cfi_undefined 60 +; WAVE32-NEXT: .cfi_undefined 61 +; WAVE32-NEXT: .cfi_undefined 72 +; WAVE32-NEXT: .cfi_undefined 73 +; WAVE32-NEXT: .cfi_undefined 74 +; WAVE32-NEXT: .cfi_undefined 75 +; WAVE32-NEXT: .cfi_undefined 76 +; WAVE32-NEXT: .cfi_undefined 77 +; WAVE32-NEXT: .cfi_undefined 78 +; WAVE32-NEXT: .cfi_undefined 79 +; WAVE32-NEXT: .cfi_undefined 88 +; WAVE32-NEXT: .cfi_undefined 89 +; WAVE32-NEXT: .cfi_undefined 90 +; WAVE32-NEXT: .cfi_undefined 91 +; WAVE32-NEXT: .cfi_undefined 92 +; WAVE32-NEXT: .cfi_undefined 93 +; WAVE32-NEXT: .cfi_undefined 94 +; WAVE32-NEXT: .cfi_undefined 95 +; WAVE32-NEXT: .cfi_undefined 1096 +; WAVE32-NEXT: .cfi_undefined 1097 +; WAVE32-NEXT: .cfi_undefined 1098 +; WAVE32-NEXT: .cfi_undefined 1099 +; WAVE32-NEXT: .cfi_undefined 1100 +; WAVE32-NEXT: .cfi_undefined 1101 +; WAVE32-NEXT: .cfi_undefined 1102 +; WAVE32-NEXT: .cfi_undefined 1103 +; WAVE32-NEXT: .cfi_undefined 1112 +; WAVE32-NEXT: .cfi_undefined 1113 +; WAVE32-NEXT: .cfi_undefined 1114 +; WAVE32-NEXT: .cfi_undefined 1115 +; WAVE32-NEXT: .cfi_undefined 1116 +; WAVE32-NEXT: .cfi_undefined 1117 +; WAVE32-NEXT: .cfi_undefined 1118 +; WAVE32-NEXT: .cfi_undefined 1119 +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: s_mov_b32 s16, s33 +; WAVE32-NEXT: s_mov_b32 s33, s32 +; WAVE32-NEXT: s_or_saveexec_b32 s17, -1 +; WAVE32-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_offset 1576, 0 +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s17 +; WAVE32-NEXT: v_writelane_b32 v40, s16, 2 +; WAVE32-NEXT: .cfi_llvm_vector_registers 65, 1576, 2, 32 +; WAVE32-NEXT: .cfi_def_cfa_register 65 +; WAVE32-NEXT: v_writelane_b32 v40, s30, 0 +; WAVE32-NEXT: s_addk_i32 s32, 0x200 +; WAVE32-NEXT: v_writelane_b32 v40, s31, 1 +; WAVE32-NEXT: .cfi_llvm_vector_registers 16, 1576, 0, 32, 1576, 1, 32 +; WAVE32-NEXT: s_getpc_b64 s[16:17] +; WAVE32-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; WAVE32-NEXT: 
s_addc_u32 s17, s17, ex@rel32@hi+12 +; WAVE32-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE32-NEXT: v_readlane_b32 s30, v40, 0 +; WAVE32-NEXT: v_readlane_b32 s31, v40, 1 +; WAVE32-NEXT: s_mov_b32 s32, s33 +; WAVE32-NEXT: v_readlane_b32 s4, v40, 2 +; WAVE32-NEXT: s_or_saveexec_b32 s5, -1 +; WAVE32-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; WAVE32-NEXT: s_waitcnt_depctr 0xffe3 +; WAVE32-NEXT: s_mov_b32 exec_lo, s5 +; WAVE32-NEXT: .cfi_def_cfa_register 64 +; WAVE32-NEXT: s_mov_b32 s33, s4 +; WAVE32-NEXT: s_waitcnt vmcnt(0) +; WAVE32-NEXT: s_setpc_b64 s[30:31] entry: call void @ex() #0 ret void } -; CHECK-LABEL: func_spill_vgpr_to_vmem: -; CHECK: .cfi_startproc - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: %bb.0: -; SGPR32 = 64 -; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 -; CHECK-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 2560 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 2561 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3072 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3073 - -; CHECK-NOT: .cfi_{{.*}} - -; WAVE32: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX900: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-V2A-DIS: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-V2A-EN: v_accvgpr_write_b32 a[[#TMP_AGPR1:]], v[[#VGPR1:]] - -; GFX900-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 256 -; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 768 -; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask [[#VGPR1+2560]], [[#TMP_AGPR1+3072]], 32, 17, 64 - -; WAVE32-NEXT: .cfi_llvm_vector_offset 1576, 32, 1, 32, 128 - -; CHECK-NOT: .cfi_{{.*}} - -; WAVE32: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX900: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX90A-V2A-DIS: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-V2A-EN: v_accvgpr_write_b32 
a[[#TMP_AGPR2:]], v[[#VGPR2:]] - -; GFX900-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 0 -; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 512 -; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask [[#VGPR2+2560]], [[#TMP_AGPR2+3072]], 32, 17, 64 - -; WAVE32: .cfi_llvm_vector_offset 1577, 32, 1, 32, 0 - -; GFX90A-V2A-DIS: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-V2A-EN: v_accvgpr_read_b32 v[[#TMP_VGPR1:]], a[[#AGPR1:]] - -; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 3104, 32, 17, 64, 256 -; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask [[#AGPR1+3072]], [[#TMP_VGPR1+2560]], 32, 17, 64 - -; CHECK-NOT: .cfi_{{.*}} - -; GFX90A-V2A-DIS: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX90A-V2A-EN: v_accvgpr_read_b32 v[[#TMP_VGPR2:]], a[[#AGPR2:]] - -; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 3105, 32, 17, 64, 0 -; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask [[#AGPR2+3072]], [[#TMP_VGPR2+2560]], 32, 17, 64 - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: .cfi_endproc define hidden void @func_spill_vgpr_to_vmem() #0 { +; GFX900-LABEL: func_spill_vgpr_to_vmem: +; GFX900: .Lfunc_begin4: +; GFX900-NEXT: .cfi_startproc +; GFX900-NEXT: ; %bb.0: ; %entry +; GFX900-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GFX900-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 256 +; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; clobber +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; clobber +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; clobber +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; clobber +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-V2A-DIS-LABEL: func_spill_vgpr_to_vmem: +; GFX90A-V2A-DIS: .Lfunc_begin4: +; GFX90A-V2A-DIS-NEXT: .cfi_startproc +; GFX90A-V2A-DIS-NEXT: ; %bb.0: ; %entry +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; GFX90A-V2A-DIS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 768 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 512 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 3104, 32, 17, 64, 256 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 3105, 32, 17, 64, 0 +; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART +; GFX90A-V2A-DIS-NEXT: ; clobber +; GFX90A-V2A-DIS-NEXT: ;;#ASMEND +; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART +; GFX90A-V2A-DIS-NEXT: ; clobber +; GFX90A-V2A-DIS-NEXT: ;;#ASMEND +; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART +; GFX90A-V2A-DIS-NEXT: ; clobber +; GFX90A-V2A-DIS-NEXT: ;;#ASMEND +; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART +; GFX90A-V2A-DIS-NEXT: ; clobber +; GFX90A-V2A-DIS-NEXT: ;;#ASMEND +; GFX90A-V2A-DIS-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: s_waitcnt vmcnt(0) +; GFX90A-V2A-DIS-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-V2A-EN-LABEL: func_spill_vgpr_to_vmem: +; GFX90A-V2A-EN: .Lfunc_begin4: +; GFX90A-V2A-EN-NEXT: .cfi_startproc +; GFX90A-V2A-EN-NEXT: ; %bb.0: ; %entry +; GFX90A-V2A-EN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GFX90A-V2A-EN-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2560 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2561 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3072 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3073 +; GFX90A-V2A-EN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2600, 3072, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2601, 3073, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v0, a32 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 3104, 2560, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v1, a33 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 3105, 2561, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: ;;#ASMSTART +; GFX90A-V2A-EN-NEXT: ; clobber +; GFX90A-V2A-EN-NEXT: ;;#ASMEND +; GFX90A-V2A-EN-NEXT: ;;#ASMSTART +; GFX90A-V2A-EN-NEXT: ; clobber +; GFX90A-V2A-EN-NEXT: ;;#ASMEND +; GFX90A-V2A-EN-NEXT: ;;#ASMSTART +; GFX90A-V2A-EN-NEXT: ; clobber +; GFX90A-V2A-EN-NEXT: ;;#ASMEND +; GFX90A-V2A-EN-NEXT: ;;#ASMSTART +; GFX90A-V2A-EN-NEXT: ; clobber +; GFX90A-V2A-EN-NEXT: ;;#ASMEND +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a33, v1 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a32, v0 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: 
func_spill_vgpr_to_vmem: +; WAVE32: .Lfunc_begin4: +; WAVE32-NEXT: .cfi_startproc +; WAVE32-NEXT: ; %bb.0: ; %entry +; WAVE32-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1576, 32, 1, 32, 128 +; WAVE32-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1577, 32, 1, 32, 0 +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: s_clause 0x1 +; WAVE32-NEXT: buffer_load_dword v41, off, s[0:3], s32 +; WAVE32-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; WAVE32-NEXT: s_waitcnt vmcnt(0) +; WAVE32-NEXT: s_setpc_b64 s[30:31] entry: call void asm sideeffect "; clobber", "~{v40}"() #0 call void asm sideeffect "; clobber", "~{v41}"() #0 @@ -638,41 +3398,134 @@ entry: ret void } -; CHECK-LABEL: func_spill_vgpr_to_agpr: -; CHECK: .cfi_startproc - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: %bb.0: -; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 -; CHECK-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 2560 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 2561 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3072 -; GFX90A-V2A-EN-NEXT: .cfi_undefined 3073 - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a[[#TMP_AGPR1:]], v[[#VGPR1:]] -; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask [[#VGPR1+2560]], [[#TMP_AGPR1+3072]], 32, 17, 64 -; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a[[#TMP_AGPR2]], v[[#VGPR2]] -; 
GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask [[#VGPR2+2560]], [[#TMP_AGPR2+3072]], 32, 17, 64 -; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v[[#TMP_VGPR1:]], a[[#AGPR1:]] -; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask [[#AGPR1+3072]], [[#TMP_VGPR1+2560]], 32, 17, 64 -; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v[[#TMP_VGPR2:]], a[[#AGPR2:]] -; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask [[#AGPR2+3072]], [[#TMP_VGPR2+2560]], 32, 17, 64 -; GFX90A-V2A-EN: v_accvgpr_write_b32 a33, v1 -; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a32, v0 -; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v41, a1 -; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v40, a0 - -; CHECK: s_setpc_b64 s[30:31] - -; CHECK-NOT: .cfi_{{.*}} -; CHECK: .cfi_endproc - define hidden void @func_spill_vgpr_to_agpr() #2 { +; GFX900-LABEL: func_spill_vgpr_to_agpr: +; GFX900: .Lfunc_begin5: +; GFX900-NEXT: .cfi_startproc +; GFX900-NEXT: ; %bb.0: +; GFX900-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GFX900-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 256 +; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; clobber +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; clobber +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; clobber +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; clobber +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-V2A-DIS-LABEL: func_spill_vgpr_to_agpr: +; GFX90A-V2A-DIS: 
.Lfunc_begin5: +; GFX90A-V2A-DIS-NEXT: .cfi_startproc +; GFX90A-V2A-DIS-NEXT: ; %bb.0: +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; GFX90A-V2A-DIS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2600, 32, 17, 64, 768 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 2601, 32, 17, 64, 512 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 3104, 32, 17, 64, 256 +; GFX90A-V2A-DIS-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-V2A-DIS-NEXT: .cfi_llvm_vector_offset 3105, 32, 17, 64, 0 +; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART +; GFX90A-V2A-DIS-NEXT: ; clobber +; GFX90A-V2A-DIS-NEXT: ;;#ASMEND +; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART +; GFX90A-V2A-DIS-NEXT: ; clobber +; GFX90A-V2A-DIS-NEXT: ;;#ASMEND +; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART +; GFX90A-V2A-DIS-NEXT: ; clobber +; GFX90A-V2A-DIS-NEXT: ;;#ASMEND +; GFX90A-V2A-DIS-NEXT: ;;#ASMSTART +; GFX90A-V2A-DIS-NEXT: ; clobber +; GFX90A-V2A-DIS-NEXT: ;;#ASMEND +; GFX90A-V2A-DIS-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-V2A-DIS-NEXT: s_waitcnt vmcnt(0) +; GFX90A-V2A-DIS-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-V2A-EN-LABEL: func_spill_vgpr_to_agpr: +; GFX90A-V2A-EN: .Lfunc_begin5: +; GFX90A-V2A-EN-NEXT: .cfi_startproc +; GFX90A-V2A-EN-NEXT: ; %bb.0: +; GFX90A-V2A-EN-NEXT: 
.cfi_llvm_def_aspace_cfa 64, 0, 6 +; GFX90A-V2A-EN-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2560 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 2561 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3072 +; GFX90A-V2A-EN-NEXT: .cfi_undefined 3073 +; GFX90A-V2A-EN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2600, 3072, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 2601, 3073, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v0, a32 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 3104, 2560, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v1, a33 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: .cfi_llvm_vector_register_mask 3105, 2561, 32, 17, 64 +; GFX90A-V2A-EN-NEXT: ;;#ASMSTART +; GFX90A-V2A-EN-NEXT: ; clobber +; GFX90A-V2A-EN-NEXT: ;;#ASMEND +; GFX90A-V2A-EN-NEXT: ;;#ASMSTART +; GFX90A-V2A-EN-NEXT: ; clobber +; GFX90A-V2A-EN-NEXT: ;;#ASMEND +; GFX90A-V2A-EN-NEXT: ;;#ASMSTART +; GFX90A-V2A-EN-NEXT: ; clobber +; GFX90A-V2A-EN-NEXT: ;;#ASMEND +; GFX90A-V2A-EN-NEXT: ;;#ASMSTART +; GFX90A-V2A-EN-NEXT: ; clobber +; GFX90A-V2A-EN-NEXT: ;;#ASMEND +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a33, v1 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_write_b32 a32, v0 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX90A-V2A-EN-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: func_spill_vgpr_to_agpr: +; WAVE32: .Lfunc_begin5: +; WAVE32-NEXT: .cfi_startproc +; WAVE32-NEXT: ; %bb.0: +; WAVE32-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; WAVE32-NEXT: .cfi_llvm_register_pair 16, 62, 32, 63, 32 +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; 
WAVE32-NEXT: .cfi_llvm_vector_offset 1576, 32, 1, 32, 128 +; WAVE32-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE32-NEXT: .cfi_llvm_vector_offset 1577, 32, 1, 32, 0 +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: ;;#ASMSTART +; WAVE32-NEXT: ; clobber +; WAVE32-NEXT: ;;#ASMEND +; WAVE32-NEXT: s_clause 0x1 +; WAVE32-NEXT: buffer_load_dword v41, off, s[0:3], s32 +; WAVE32-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; WAVE32-NEXT: s_waitcnt vmcnt(0) +; WAVE32-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{v40}"() call void asm sideeffect "; clobber", "~{v41}"() call void asm sideeffect "; clobber", "~{a32}"() diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll index ff23c5f7f72c6..8e6eb66bd250e 100644 --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -26,7 +26,7 @@ define internal void @direct() { define amdgpu_kernel void @test_direct_indirect_call() { ; CHECK-LABEL: define {{[^@]+}}@test_direct_indirect_call -; CHECK-SAME: () #[[ATTR2:[0-9]+]] { +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: call void @direct() ; CHECK-NEXT: ret void ; @@ -36,5 +36,4 @@ define amdgpu_kernel void @test_direct_indirect_call() { ;. 
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll new file mode 100644 index 0000000000000..007e3f0a6bdbc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=amdgpu-unify-divergent-exit-nodes -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s + +declare void @foo(ptr) +declare i1 @bar(ptr) + +define void @musttail_call_without_return_value(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i1, ptr [[P]], align 1 +; CHECK-NEXT: br i1 [[LOAD]], label %[[BB_0:.*]], label %[[BB_1:.*]] +; CHECK: [[BB_0]]: +; CHECK-NEXT: musttail call void @foo(ptr [[P]]) +; CHECK-NEXT: ret void +; CHECK: [[BB_1]]: +; CHECK-NEXT: ret void +; +entry: + %load = load i1, ptr %p, align 1 + br i1 %load, label %bb.0, label %bb.1 + +bb.0: + musttail call void @foo(ptr %p) + ret void + +bb.1: + ret void +} + +define i1 @musttail_call_with_return_value(ptr %p) { +; CHECK-LABEL: define i1 @musttail_call_with_return_value( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i1, ptr [[P]], align 1 +; CHECK-NEXT: br i1 [[LOAD]], label %[[BB_0:.*]], label %[[BB_1:.*]] +; CHECK: [[BB_0]]: +; CHECK-NEXT: [[RET:%.*]] = musttail call i1 @bar(ptr [[P]]) +; CHECK-NEXT: ret i1 [[RET]] +; CHECK: [[BB_1]]: +; CHECK-NEXT: ret i1 [[LOAD]] +; +entry: + %load = load i1, ptr %p, align 1 + br i1 %load, label %bb.0, label %bb.1 + +bb.0: + %ret = musttail call i1 @bar(ptr %p) + ret i1 %ret + +bb.1: + ret i1 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 
4d9c85ef99dcd..9b91a3dc9b6e4 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1321,19 +1321,19 @@ bb: define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspace(3) %arg) { ; CI-LABEL: ds_read_call_read: ; CI: ; %bb.0: -; CI-NEXT: s_getpc_b64 s[40:41] -; CI-NEXT: s_mov_b32 s40, s0 -; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0 +; CI-NEXT: s_getpc_b64 s[48:49] +; CI-NEXT: s_mov_b32 s48, s0 +; CI-NEXT: s_load_dwordx4 s[48:51], s[48:49], 0x0 ; CI-NEXT: s_mov_b32 s14, s10 ; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s12, s8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s40, s40, s11 +; CI-NEXT: s_add_u32 s48, s48, s11 ; CI-NEXT: s_mov_b64 s[10:11], s[6:7] ; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 ; CI-NEXT: s_load_dword s6, s[4:5], 0x2 -; CI-NEXT: s_addc_u32 s41, s41, 0 +; CI-NEXT: s_addc_u32 s49, s49, 0 ; CI-NEXT: s_add_u32 s8, s4, 12 ; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; CI-NEXT: s_mov_b32 s13, s9 @@ -1345,11 +1345,11 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] -; CI-NEXT: s_mov_b64 s[0:1], s[40:41] +; CI-NEXT: s_mov_b64 s[0:1], s[48:49] ; CI-NEXT: s_mov_b32 s17, void_func_void@abs32@hi ; CI-NEXT: s_mov_b32 s16, void_func_void@abs32@lo ; CI-NEXT: v_or_b32_e32 v31, v0, v2 -; CI-NEXT: s_mov_b64 s[2:3], s[42:43] +; CI-NEXT: s_mov_b64 s[2:3], s[50:51] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_mov_b32 s39, 0xf000 ; CI-NEXT: s_mov_b32 s38, -1 diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 1e530d55756ef..43bfe594b3be5 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -1,11 +1,7 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s define internal void @indirect() { -; AKF_GCN-LABEL: define {{[^@]+}}@indirect() { -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@indirect ; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: ret void @@ -14,14 +10,6 @@ define internal void @indirect() { } define amdgpu_kernel void @test_simple_indirect_call() #0 { -; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call -; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] { -; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) -; AKF_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 -; AKF_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 -; AKF_GCN-NEXT: call void [[FP]]() -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -38,8 +26,6 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 { attributes #0 = { "amdgpu-no-dispatch-id" } -;. -; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. 
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll index 4755516cf0094..884d712e93ebe 100644 --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -experimental-debug-variable-locations=false < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck %s ; Don't crash. 
@@ -447,6 +447,38 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: .cfi_undefined 59 ; CHECK-NEXT: .cfi_undefined 60 ; CHECK-NEXT: .cfi_undefined 61 +; CHECK-NEXT: .cfi_undefined 72 +; CHECK-NEXT: .cfi_undefined 73 +; CHECK-NEXT: .cfi_undefined 74 +; CHECK-NEXT: .cfi_undefined 75 +; CHECK-NEXT: .cfi_undefined 76 +; CHECK-NEXT: .cfi_undefined 77 +; CHECK-NEXT: .cfi_undefined 78 +; CHECK-NEXT: .cfi_undefined 79 +; CHECK-NEXT: .cfi_undefined 88 +; CHECK-NEXT: .cfi_undefined 89 +; CHECK-NEXT: .cfi_undefined 90 +; CHECK-NEXT: .cfi_undefined 91 +; CHECK-NEXT: .cfi_undefined 92 +; CHECK-NEXT: .cfi_undefined 93 +; CHECK-NEXT: .cfi_undefined 94 +; CHECK-NEXT: .cfi_undefined 95 +; CHECK-NEXT: .cfi_undefined 1096 +; CHECK-NEXT: .cfi_undefined 1097 +; CHECK-NEXT: .cfi_undefined 1098 +; CHECK-NEXT: .cfi_undefined 1099 +; CHECK-NEXT: .cfi_undefined 1100 +; CHECK-NEXT: .cfi_undefined 1101 +; CHECK-NEXT: .cfi_undefined 1102 +; CHECK-NEXT: .cfi_undefined 1103 +; CHECK-NEXT: .cfi_undefined 1112 +; CHECK-NEXT: .cfi_undefined 1113 +; CHECK-NEXT: .cfi_undefined 1114 +; CHECK-NEXT: .cfi_undefined 1115 +; CHECK-NEXT: .cfi_undefined 1116 +; CHECK-NEXT: .cfi_undefined 1117 +; CHECK-NEXT: .cfi_undefined 1118 +; CHECK-NEXT: .cfi_undefined 1119 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 @@ -472,70 +504,70 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: .cfi_llvm_vector_registers 70, 2601, 4, 32 ; CHECK-NEXT: v_writelane_b32 v41, s39, 5 ; CHECK-NEXT: .cfi_llvm_vector_registers 71, 2601, 5, 32 -; CHECK-NEXT: v_writelane_b32 v41, s40, 6 -; CHECK-NEXT: .cfi_llvm_vector_registers 72, 2601, 6, 32 -; CHECK-NEXT: v_writelane_b32 v41, s41, 7 -; CHECK-NEXT: .cfi_llvm_vector_registers 73, 2601, 7, 32 -; CHECK-NEXT: v_writelane_b32 v41, s42, 8 -; CHECK-NEXT: .cfi_llvm_vector_registers 74, 2601, 8, 32 -; CHECK-NEXT: v_writelane_b32 v41, s43, 9 -; CHECK-NEXT: .cfi_llvm_vector_registers 75, 2601, 9, 32 
-; CHECK-NEXT: v_writelane_b32 v41, s44, 10 -; CHECK-NEXT: .cfi_llvm_vector_registers 76, 2601, 10, 32 -; CHECK-NEXT: v_writelane_b32 v41, s45, 11 -; CHECK-NEXT: .cfi_llvm_vector_registers 77, 2601, 11, 32 -; CHECK-NEXT: v_writelane_b32 v41, s46, 12 -; CHECK-NEXT: .cfi_llvm_vector_registers 78, 2601, 12, 32 -; CHECK-NEXT: v_writelane_b32 v41, s47, 13 -; CHECK-NEXT: .cfi_llvm_vector_registers 79, 2601, 13, 32 +; CHECK-NEXT: v_writelane_b32 v41, s48, 6 +; CHECK-NEXT: .cfi_llvm_vector_registers 80, 2601, 6, 32 +; CHECK-NEXT: v_writelane_b32 v41, s49, 7 +; CHECK-NEXT: .cfi_llvm_vector_registers 81, 2601, 7, 32 +; CHECK-NEXT: v_writelane_b32 v41, s50, 8 +; CHECK-NEXT: .cfi_llvm_vector_registers 82, 2601, 8, 32 +; CHECK-NEXT: v_writelane_b32 v41, s51, 9 +; CHECK-NEXT: .cfi_llvm_vector_registers 83, 2601, 9, 32 +; CHECK-NEXT: v_writelane_b32 v41, s52, 10 +; CHECK-NEXT: .cfi_llvm_vector_registers 84, 2601, 10, 32 +; CHECK-NEXT: v_writelane_b32 v41, s53, 11 +; CHECK-NEXT: .cfi_llvm_vector_registers 85, 2601, 11, 32 +; CHECK-NEXT: v_writelane_b32 v41, s54, 12 +; CHECK-NEXT: .cfi_llvm_vector_registers 86, 2601, 12, 32 +; CHECK-NEXT: v_writelane_b32 v41, s55, 13 +; CHECK-NEXT: .cfi_llvm_vector_registers 87, 2601, 13, 32 ; CHECK-NEXT: v_writelane_b32 v41, s30, 14 ; CHECK-NEXT: v_writelane_b32 v41, s31, 15 ; CHECK-NEXT: .cfi_llvm_vector_registers 16, 2601, 14, 32, 2601, 15, 32 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- undef ; CHECK-NEXT: .Ltmp0: ; CHECK-NEXT: .loc 1 49 9 prologue_end ; dummy:49:9 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; 
CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47] -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] ; CHECK-NEXT: .Ltmp1: -; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- [$vgpr0_vgpr1+0] +; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- [DW_OP_deref] undef ; CHECK-NEXT: .loc 1 0 9 is_stmt 0 ; dummy:0:9 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_readlane_b32 s30, v41, 14 ; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: v_readlane_b32 s31, v41, 15 -; CHECK-NEXT: v_readlane_b32 s47, v41, 13 -; CHECK-NEXT: v_readlane_b32 s46, v41, 12 -; CHECK-NEXT: v_readlane_b32 s45, v41, 11 -; CHECK-NEXT: v_readlane_b32 s44, v41, 10 -; CHECK-NEXT: v_readlane_b32 s43, v41, 9 -; CHECK-NEXT: v_readlane_b32 s42, v41, 8 -; CHECK-NEXT: v_readlane_b32 s41, v41, 7 -; CHECK-NEXT: v_readlane_b32 s40, v41, 6 +; CHECK-NEXT: v_readlane_b32 s55, v41, 13 +; CHECK-NEXT: v_readlane_b32 s54, v41, 12 +; 
CHECK-NEXT: v_readlane_b32 s53, v41, 11 +; CHECK-NEXT: v_readlane_b32 s52, v41, 10 +; CHECK-NEXT: v_readlane_b32 s51, v41, 9 +; CHECK-NEXT: v_readlane_b32 s50, v41, 8 +; CHECK-NEXT: v_readlane_b32 s49, v41, 7 +; CHECK-NEXT: v_readlane_b32 s48, v41, 6 ; CHECK-NEXT: v_readlane_b32 s39, v41, 5 ; CHECK-NEXT: v_readlane_b32 s38, v41, 4 ; CHECK-NEXT: v_readlane_b32 s37, v41, 3 diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index f05e1a3eacf62..87a39cdfaf2cd 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -731,7 +731,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-SDAG-NEXT: s_add_i32 s1, s1, 15 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_and_b32 s4, s1, -16 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_and_b32 s1, s3, 0xfffff800 ; GFX11-SDAG-NEXT: s_lshl_b32 s3, s4, 5 ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s3 @@ -787,7 +787,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-GISEL-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-GISEL-NEXT: s_add_u32 s3, s32, 0x7ff ; GFX11-GISEL-NEXT: s_and_b32 s1, s1, -16 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_lshl_b32 s4, s1, 5 ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-GISEL-NEXT: s_and_b32 s1, s3, 0xfffff800 diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir index 88ed63019088c..3cfb96fede71a 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir @@ -209,6 +209,38 @@ 
body: | ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr93 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX8-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base ; GFX8-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX8-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc @@ -410,6 +442,38 @@ body: | ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr78 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX900-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base ; GFX900-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX900-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc @@ -643,6 +707,38 @@ body: | ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr63 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX90A-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc @@ -844,6 +940,38 @@ body: | ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr56 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX1010-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base ; GFX1010-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX1010-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc_lo @@ -1045,6 +1173,38 @@ body: | ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; 
GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX1100-NEXT: 
renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base ; GFX1100-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX1100-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc_lo @@ -1054,9 +1214,8 @@ body: | ; GFX1100-NEXT: renamable $sgpr20 = S_MOV_B32 killed $sgpr22 ; GFX1100-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23 ; GFX1100-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27 - ; GFX1100-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, 8, implicit-def $scc - ; GFX1100-NEXT: renamable $sgpr31 = S_MOV_B32 $sgpr32 - ; GFX1100-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, -8, implicit-def $scc + ; GFX1100-NEXT: $sgpr40 = S_ADD_I32 $sgpr32, 8, implicit-def $scc + ; GFX1100-NEXT: renamable $sgpr31 = S_MOV_B32 killed $sgpr40 ; GFX1100-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec @@ -1247,6 +1406,38 @@ body: | ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + 
; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX1200-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base ; GFX1200-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX1200-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc_lo @@ -1256,9 +1447,8 @@ body: | ; GFX1200-NEXT: renamable $sgpr20 = S_MOV_B32 killed $sgpr22 ; GFX1200-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23 ; GFX1200-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27 - ; GFX1200-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, 8, implicit-def $scc - ; GFX1200-NEXT: renamable $sgpr31 = S_MOV_B32 $sgpr32 - ; 
GFX1200-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, -8, implicit-def $scc + ; GFX1200-NEXT: $sgpr40 = S_ADD_I32 $sgpr32, 8, implicit-def $scc + ; GFX1200-NEXT: renamable $sgpr31 = S_MOV_B32 killed $sgpr40 ; GFX1200-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec @@ -2252,6 +2442,22 @@ body: | ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2262,8 +2468,9 @@ body: | ; GFX8-NEXT: S_NOP 0, implicit-def 
$sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc - ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 24, 64, $sgpr32, 0, implicit $exec - ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $vgpr0, implicit $exec + ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $sgpr4 = S_MOV_B32 24 + ; GFX8-NEXT: $vgpr0, dead $sgpr72_sgpr73 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec ; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 @@ -2310,6 +2517,22 @@ body: | ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr63 ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2368,6 +2591,22 @@ body: | ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2426,6 +2665,22 @@ body: | ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr29 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2484,6 +2739,22 @@ body: | ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX1100-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2494,10 +2765,10 @@ body: | ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc - ; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc - ; GFX1100-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc - ; GFX1100-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi - ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi + ; GFX1100-NEXT: $sgpr72 = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc + ; GFX1100-NEXT: S_BITCMP1_B32 $sgpr72, 0, implicit-def $scc + ; GFX1100-NEXT: $sgpr72 = S_BITSET0_B32 0, $sgpr72 + ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr72 ; GFX1100-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1100-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1100-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2543,6 +2814,22 @@ body: | ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; 
GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2553,10 +2840,10 @@ body: | ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc - ; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc - ; GFX1200-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc - ; GFX1200-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi - ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi + ; GFX1200-NEXT: $sgpr72 = S_ADDC_U32 
$sgpr32, 24, implicit-def $scc, implicit $scc + ; GFX1200-NEXT: S_BITCMP1_B32 $sgpr72, 0, implicit-def $scc + ; GFX1200-NEXT: $sgpr72 = S_BITSET0_B32 0, $sgpr72 + ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr72 ; GFX1200-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1200-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1200-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2638,6 +2925,22 @@ body: | ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2648,9 +2951,9 @@ body: | ; GFX8-NEXT: S_NOP 0, 
implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc - ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 68, implicit $exec - ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec - ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $vgpr0, implicit $exec + ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $sgpr4 = S_MOV_B32 68 + ; GFX8-NEXT: $vgpr0, dead $sgpr72_sgpr73 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec ; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 @@ -2697,6 +3000,22 @@ body: | ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX900-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr62 + ; GFX900-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2755,6 +3074,22 @@ body: | ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2813,6 +3148,22 @@ body: | ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr28 ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX1010-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2871,6 +3222,22 @@ body: | ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX1100-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX1100-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2881,10 +3248,10 @@ body: | ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc - ; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc - ; GFX1100-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc - ; GFX1100-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi - ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi + ; GFX1100-NEXT: $sgpr72 = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc + ; GFX1100-NEXT: S_BITCMP1_B32 $sgpr72, 0, implicit-def $scc + ; GFX1100-NEXT: $sgpr72 = S_BITSET0_B32 0, $sgpr72 + ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr72 ; GFX1100-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1100-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1100-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2930,6 +3297,22 @@ body: | ; 
GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX1200-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 @@ -2940,10 +3323,10 @@ body: | ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc - ; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc - ; GFX1200-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc - ; GFX1200-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi - ; GFX1200-NEXT: renamable $sgpr4 = 
S_MOV_B32 killed $vcc_hi + ; GFX1200-NEXT: $sgpr72 = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc + ; GFX1200-NEXT: S_BITCMP1_B32 $sgpr72, 0, implicit-def $scc + ; GFX1200-NEXT: $sgpr72 = S_BITSET0_B32 0, $sgpr72 + ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr72 ; GFX1200-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1200-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX1200-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index 67f2487aed73a..c69b0cce3d208 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -29,20 +29,18 @@ entry: define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: int4_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s6, 1 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s6, 2 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s6, 3 -; GCN-NEXT: v_cndmask_b32_e32 v0, 2, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v2, 4, v0, vcc +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b32 s3, 1, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cselect_b32 s3, s3, 2 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cselect_b32 s2, s3, 4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm entry: @@ -303,12 +301,11 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s10, s0 ; 
GCN-NEXT: s_mov_b32 s12, s0 ; GCN-NEXT: s_mov_b32 s14, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s18, s18, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v15, s15 -; GCN-NEXT: s_mov_b32 m0, s18 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 m0, s18, 1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s4 @@ -354,11 +351,10 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s10, s0 ; GCN-NEXT: s_mov_b32 s12, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s16, s16, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v15, s15 -; GCN-NEXT: s_mov_b32 m0, s16 +; GCN-NEXT: s_lshl_b32 m0, s16, 1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s4 @@ -453,12 +449,11 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s60, s36 ; GCN-NEXT: s_mov_b32 s62, s36 ; GCN-NEXT: s_mov_b32 s64, s36 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v31, s67 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 m0, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NEXT: v_mov_b32_e32 v4, s40 @@ -537,12 +532,11 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s62, s36 ; GCN-NEXT: s_mov_b32 s64, s36 ; GCN-NEXT: s_mov_b32 s66, s36 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v31, s67 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 m0, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: 
v_mov_b32_e32 v3, s39 ; GCN-NEXT: v_mov_b32_e32 v4, s40 diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 23b54c6741e51..a00abecabdffb 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -1807,6 +1807,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -1940,6 +1941,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2072,6 +2074,7 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2204,6 +2207,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2337,6 +2341,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; 
GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2470,6 +2475,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2603,6 +2609,7 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2736,6 +2743,7 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -2868,6 +2876,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -3000,6 +3009,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -3133,6 +3143,7 
@@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -3265,6 +3276,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -3397,6 +3409,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm @@ -3529,6 +3542,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index a457338873157..fec04a27cda91 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1529,42 +1529,35 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; SI-NEXT: v_med3_i32 v1, s6, 0, 13 ; SI-NEXT: s_or_b32 s4, s2, 0x1000 ; SI-NEXT: v_readfirstlane_b32 s6, v1 -; SI-NEXT: s_lshr_b32 s6, s4, s6 -; SI-NEXT: v_lshl_b32_e32 v1, s6, v1 -; 
SI-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-NEXT: s_add_i32 s8, s5, 0xfffffc10 -; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: s_lshl_b32 s5, s8, 12 -; SI-NEXT: s_or_b32 s4, s6, s4 -; SI-NEXT: s_or_b32 s5, s2, s5 -; SI-NEXT: s_cmp_lt_i32 s8, 1 -; SI-NEXT: s_cselect_b32 s9, s4, s5 -; SI-NEXT: s_and_b32 s6, s9, 7 +; SI-NEXT: s_lshr_b32 s7, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s7, s6 +; SI-NEXT: s_cmp_lg_u32 s6, s4 +; SI-NEXT: s_cselect_b32 s4, 1, 0 +; SI-NEXT: s_addk_i32 s5, 0xfc10 +; SI-NEXT: s_lshl_b32 s6, s5, 12 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: s_or_b32 s6, s2, s6 +; SI-NEXT: s_cmp_lt_i32 s5, 1 +; SI-NEXT: s_cselect_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s4, 7 ; SI-NEXT: s_cmp_gt_i32 s6, 5 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_cselect_b32 s7, 1, 0 ; SI-NEXT: s_cmp_eq_u32 s6, 3 -; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SI-NEXT: s_lshr_b32 s6, s9, 2 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_cmp_lg_u32 s4, 0 -; SI-NEXT: s_addc_u32 s4, s6, 0 -; SI-NEXT: s_cmp_lt_i32 s8, 31 -; SI-NEXT: s_cselect_b32 s6, s4, 0x7c00 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_lshr_b32 s4, s4, 2 +; SI-NEXT: s_add_i32 s4, s4, s6 +; SI-NEXT: s_cmp_lt_i32 s5, 31 +; SI-NEXT: s_cselect_b32 s4, s4, 0x7c00 ; SI-NEXT: s_cmp_lg_u32 s2, 0 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; SI-NEXT: v_lshlrev_b32_e32 v1, 9, v1 -; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f -; SI-NEXT: v_or_b32_e32 v1, 0x7c00, v1 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s2, s3, 16 -; SI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; SI-NEXT: s_and_b32 s2, s2, 0x8000 -; SI-NEXT: v_or_b32_e32 v1, s2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_movk_i32 s2, 0x7e00 +; SI-NEXT: s_cselect_b32 s2, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s5, 0x40f +; SI-NEXT: s_cselect_b32 s2, s2, s4 +; 
SI-NEXT: s_lshr_b32 s3, s3, 16 +; SI-NEXT: s_and_b32 s3, s3, 0x8000 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 @@ -1587,47 +1580,42 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: s_bfe_u32 s1, s3, 0xb0014 -; VI-NEXT: v_readfirstlane_b32 s0, v2 -; VI-NEXT: s_sub_i32 s2, 0x3f1, s1 -; VI-NEXT: s_or_b32 s5, s5, s0 -; VI-NEXT: v_med3_i32 v2, s2, 0, 13 -; VI-NEXT: s_or_b32 s0, s5, 0x1000 -; VI-NEXT: v_readfirstlane_b32 s2, v2 -; VI-NEXT: s_lshr_b32 s2, s0, s2 -; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, s0, v2 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-NEXT: s_add_i32 s6, s1, 0xfffffc10 +; VI-NEXT: s_bfe_u32 s2, s3, 0xb0014 ; VI-NEXT: v_readfirstlane_b32 s0, v2 -; VI-NEXT: s_lshl_b32 s1, s6, 12 -; VI-NEXT: s_or_b32 s0, s2, s0 +; VI-NEXT: s_sub_i32 s3, 0x3f1, s2 +; VI-NEXT: s_or_b32 s0, s5, s0 +; VI-NEXT: v_med3_i32 v2, s3, 0, 13 +; VI-NEXT: s_or_b32 s1, s0, 0x1000 +; VI-NEXT: v_readfirstlane_b32 s3, v2 +; VI-NEXT: s_lshr_b32 s5, s1, s3 +; VI-NEXT: s_lshl_b32 s3, s5, s3 +; VI-NEXT: s_cmp_lg_u32 s3, s1 +; VI-NEXT: s_cselect_b32 s1, 1, 0 +; VI-NEXT: s_addk_i32 s2, 0xfc10 +; VI-NEXT: s_lshl_b32 s3, s2, 12 ; VI-NEXT: s_or_b32 s1, s5, s1 -; VI-NEXT: s_cmp_lt_i32 s6, 1 -; VI-NEXT: s_cselect_b32 s7, s0, s1 -; VI-NEXT: s_and_b32 s2, s7, 7 -; VI-NEXT: s_cmp_gt_i32 s2, 5 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s2, 3 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; VI-NEXT: s_lshr_b32 s2, s7, 2 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_addc_u32 s0, s2, 0 -; VI-NEXT: s_cmp_lt_i32 s6, 31 -; VI-NEXT: s_cselect_b32 s2, s0, 0x7c00 -; VI-NEXT: s_cmp_lg_u32 s5, 0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_lshlrev_b32_e32 v2, 9, v2 -; VI-NEXT: s_cmpk_eq_i32 s6, 0x40f -; VI-NEXT: v_or_b32_e32 v2, 0x7c00, v2 -; VI-NEXT: v_mov_b32_e32 v3, s2 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; VI-NEXT: s_movk_i32 s0, 0x7fff +; VI-NEXT: s_or_b32 s3, s0, s3 +; VI-NEXT: s_cmp_lt_i32 s2, 1 +; VI-NEXT: s_cselect_b32 s1, s1, s3 +; VI-NEXT: s_and_b32 s3, s1, 7 +; VI-NEXT: s_cmp_gt_i32 s3, 5 +; VI-NEXT: s_cselect_b32 s5, 1, 0 +; VI-NEXT: s_cmp_eq_u32 s3, 3 +; VI-NEXT: s_cselect_b32 s3, 1, 0 +; VI-NEXT: s_or_b32 s3, s3, s5 +; VI-NEXT: s_lshr_b32 s1, s1, 2 +; VI-NEXT: s_add_i32 s1, s1, s3 +; VI-NEXT: s_cmp_lt_i32 s2, 31 +; VI-NEXT: s_cselect_b32 s1, s1, 0x7c00 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_movk_i32 s0, 0x7e00 +; VI-NEXT: s_cselect_b32 s0, s0, 0x7c00 +; VI-NEXT: s_cmpk_eq_i32 s2, 0x40f +; VI-NEXT: s_cselect_b32 s0, s0, s1 +; VI-NEXT: s_movk_i32 s1, 0x7fff +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 +; VI-NEXT: v_bfi_b32 v2, s1, v2, v3 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1646,45 +1634,40 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX9-NEXT: s_bfe_u32 s3, s3, 0xb0014 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: s_sub_i32 s4, 0x3f1, s3 -; GFX9-NEXT: s_or_b32 s7, s7, s2 -; GFX9-NEXT: v_med3_i32 v1, s4, 0, 13 -; GFX9-NEXT: s_or_b32 s2, s7, 0x1000 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_lshr_b32 s4, s2, s4 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s4 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s2, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: s_add_i32 s8, s3, 0xfffffc10 -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: s_lshl_b32 s3, s8, 12 -; GFX9-NEXT: s_or_b32 s2, s4, s2 -; GFX9-NEXT: s_or_b32 s3, s7, s3 -; GFX9-NEXT: s_cmp_lt_i32 s8, 1 -; GFX9-NEXT: s_cselect_b32 
s9, s2, s3 -; GFX9-NEXT: s_and_b32 s4, s9, 7 -; GFX9-NEXT: s_cmp_gt_i32 s4, 5 -; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s4, 3 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] -; GFX9-NEXT: s_lshr_b32 s4, s9, 2 -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: s_addc_u32 s2, s4, 0 -; GFX9-NEXT: s_cmp_lt_i32 s8, 31 -; GFX9-NEXT: s_cselect_b32 s4, s2, 0x7c00 -; GFX9-NEXT: s_cmp_lg_u32 s7, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 9, v1 -; GFX9-NEXT: s_cmpk_eq_i32 s8, 0x40f -; GFX9-NEXT: v_or_b32_e32 v1, 0x7c00, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_sub_i32 s5, 0x3f1, s3 +; GFX9-NEXT: s_or_b32 s2, s7, s2 +; GFX9-NEXT: v_med3_i32 v1, s5, 0, 13 +; GFX9-NEXT: s_or_b32 s4, s2, 0x1000 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: s_lshr_b32 s7, s4, s5 +; GFX9-NEXT: s_lshl_b32 s5, s7, s5 +; GFX9-NEXT: s_cmp_lg_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_addk_i32 s3, 0xfc10 +; GFX9-NEXT: s_lshl_b32 s5, s3, 12 +; GFX9-NEXT: s_or_b32 s4, s7, s4 +; GFX9-NEXT: s_or_b32 s5, s2, s5 +; GFX9-NEXT: s_cmp_lt_i32 s3, 1 +; GFX9-NEXT: s_cselect_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s4, 7 +; GFX9-NEXT: s_cmp_gt_i32 s5, 5 +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 3 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_lshr_b32 s4, s4, 2 +; GFX9-NEXT: s_add_i32 s4, s4, s5 +; GFX9-NEXT: s_cmp_lt_i32 s3, 31 +; GFX9-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_movk_i32 s2, 0x7e00 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7c00 +; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_movk_i32 s3, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; 
GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_bfi_b32 v1, s3, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1704,51 +1687,48 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: s_bfe_u32 s2, s3, 0xb0014 ; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s2 -; GFX11-NEXT: s_addk_i32 s2, 0xfc10 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 ; GFX11-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-NEXT: s_lshl_b32 s7, s2, 12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_or_b32 s3, s5, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s5, s3, 0x1000 -; GFX11-NEXT: s_or_b32 s7, s3, s7 -; GFX11-NEXT: s_lshr_b32 s6, s5, s6 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-NEXT: s_or_b32 s5, s6, s5 -; GFX11-NEXT: s_cmp_lt_i32 s2, 1 -; GFX11-NEXT: s_cselect_b32 s5, s5, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s7, s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s6, s7, s6 +; GFX11-NEXT: s_cmp_lg_u32 s6, s5 +; 
GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_addk_i32 s2, 0xfc10 +; GFX11-NEXT: s_or_b32 s5, s7, s5 +; GFX11-NEXT: s_lshl_b32 s6, s2, 12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s6, s3, s6 +; GFX11-NEXT: s_cmp_lt_i32 s2, 1 +; GFX11-NEXT: s_cselect_b32 s5, s5, s6 ; GFX11-NEXT: s_and_b32 s6, s5, 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_gt_i32 s6, 5 -; GFX11-NEXT: s_cselect_b32 s7, -1, 0 +; GFX11-NEXT: s_cselect_b32 s7, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 +; GFX11-NEXT: s_cselect_b32 s6, 1, 0 ; GFX11-NEXT: s_lshr_b32 s5, s5, 2 ; GFX11-NEXT: s_or_b32 s6, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s6, 0 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-NEXT: s_add_i32 s5, s5, s6 ; GFX11-NEXT: s_cmp_lt_i32 s2, 31 +; GFX11-NEXT: s_movk_i32 s6, 0x7e00 ; GFX11-NEXT: s_cselect_b32 s5, s5, 0x7c00 ; GFX11-NEXT: s_cmp_lg_u32 s3, 0 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cselect_b32 s3, s6, 0x7c00 ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0 -; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 +; GFX11-NEXT: s_cselect_b32 s2, s3, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %mag.trunc = fptrunc double %mag to half diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll index a324ba35b155f..3983655285e57 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll @@ -1499,8 +1499,7 @@ define float @v_recip_sqrt_f32_ulp25(float %x) { ; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CODEGEN-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; CODEGEN-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CODEGEN-IEEE-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; CODEGEN-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; CODEGEN-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 ; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -1535,8 +1534,7 @@ define float @v_recip_sqrt_f32_ulp25(float %x) { ; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; IR-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; IR-IEEE-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; IR-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 ; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 72f883928cffb..707cae9534830 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -5705,6 +5705,7 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execz .LBB30_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5714,7 +5715,6 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: @@ -6059,6 +6059,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -6075,7 +6076,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB31_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB31_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -6104,6 +6105,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB31_2 ; GFX12-NEXT: .LBB31_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6111,7 +6113,6 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6479,6 +6480,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -6495,7 +6497,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB32_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -6524,6 +6526,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB32_2 ; GFX12-NEXT: .LBB32_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6531,7 +6534,6 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6912,7 +6914,6 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: .LBB33_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -6940,6 +6941,7 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execz .LBB33_2 ; GFX12-NEXT: .LBB33_6: ; 
%atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6947,7 +6949,6 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -7291,6 +7292,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -7306,7 +7308,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB34_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -7334,6 +7336,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB34_2 ; GFX12-NEXT: .LBB34_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -7341,7 +7344,6 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7701,6 +7703,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -7716,7 +7719,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB35_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -7744,6 +7747,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB35_2 ; GFX12-NEXT: .LBB35_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -7751,7 +7755,6 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8150,7 +8153,6 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8405,6 +8407,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8440,7 +8443,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8704,6 +8706,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8739,7 +8742,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9036,7 +9038,6 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: 
; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9282,6 +9283,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -9315,7 +9317,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9570,6 +9571,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -9603,7 +9605,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9880,7 +9881,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10097,7 +10097,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10297,6 +10296,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -10333,7 +10333,6 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10599,6 +10598,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -10633,7 +10633,6 @@ define void 
@flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10915,8 +10914,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -10936,7 +10936,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -11236,6 +11235,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11259,8 +11259,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; 
GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11280,7 +11281,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11590,6 +11590,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11613,8 +11614,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 @@ -11634,7 +11636,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11945,6 +11946,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11968,6 +11970,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11987,7 +11990,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12288,6 +12290,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -12311,6 +12314,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -12330,7 +12334,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12646,6 +12649,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -12664,7 +12668,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12928,8 +12931,9 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: v_bfe_u32 
v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -12947,7 +12951,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -13215,6 +13218,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -13234,7 +13238,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -13525,6 +13528,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: 
s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -13548,8 +13552,9 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -13570,7 +13575,6 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13882,6 +13886,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -13905,6 +13910,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -13925,7 +13931,6 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 36aa73fbf8e92..75eb68557b174 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -2808,6 +2808,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execz .LBB18_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2819,7 +2820,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -3161,6 +3161,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; 
GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -3177,7 +3178,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -3207,6 +3208,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3216,7 +3218,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3573,6 +3574,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -3589,7 +3591,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB20_3: 
; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -3619,6 +3621,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3628,7 +3631,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3998,7 +4000,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] @@ -4028,6 +4029,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4037,7 +4039,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -4372,6 +4373,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -4387,7 +4389,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] @@ -4417,6 +4419,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4426,7 +4429,6 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -4777,6 +4779,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, 
vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -4792,7 +4795,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] @@ -4822,6 +4825,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4831,7 +4835,6 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5219,6 +5222,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_cbranch_execz .LBB24_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5230,7 +5234,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -5642,6 +5645,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_cbranch_execz .LBB25_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5653,7 +5657,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6034,7 +6037,6 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -6301,6 +6303,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -6338,7 +6341,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6614,6 +6616,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -6651,7 +6654,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6963,7 +6965,6 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -7222,6 +7223,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7258,7 +7260,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7526,6 +7527,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7562,7 +7564,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7856,7 +7857,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8092,7 +8092,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8299,6 +8298,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8337,7 +8337,6 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8615,6 +8614,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8652,7 +8652,6 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8947,8 +8946,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 
v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8968,7 +8968,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9269,6 +9268,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9292,8 +9292,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9313,7 +9314,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9624,6 +9624,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9647,8 +9648,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9668,7 +9670,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10001,6 +10002,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 @@ -10020,7 +10022,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -10312,6 +10313,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10335,6 +10337,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10354,7 +10357,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10656,6 +10658,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 
-1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10679,6 +10682,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10698,7 +10702,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11015,6 +11018,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -11033,7 +11037,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11298,8 +11301,9 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: 
v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11317,7 +11321,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11565,6 +11568,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11588,8 +11592,9 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 
v5, 16, v5 @@ -11610,7 +11615,6 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11923,6 +11927,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11946,6 +11951,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11966,7 +11972,6 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12296,7 +12301,6 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -12532,7 +12536,6 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12771,7 +12774,6 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13027,7 +13029,6 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -13254,7 +13255,6 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13488,7 +13488,6 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13742,7 +13741,6 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13983,7 +13981,6 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14223,9 +14220,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14242,7 +14240,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -14569,9 +14566,10 @@ define <2 x bfloat> 
@flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14588,7 +14586,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14918,9 +14915,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14937,7 +14935,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15284,8 +15281,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15302,7 +15301,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15619,8 +15617,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15637,7 +15637,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15961,8 +15960,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15979,7 +15980,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16322,9 +16322,10 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -16342,7 +16343,6 @@ define <2 x bfloat> 
@flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16673,8 +16673,10 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16692,7 +16694,6 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index d96d3db9f005d..a05e4a0cb2396 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -2808,6 +2808,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execz .LBB18_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; 
GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2819,7 +2820,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3161,6 +3161,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -3177,7 +3178,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -3207,6 +3208,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3216,7 +3218,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3573,6 +3574,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -3589,7 +3591,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -3619,6 +3621,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3628,7 +3631,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3998,7 +4000,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 
s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] @@ -4028,6 +4029,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4037,7 +4039,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -4372,6 +4373,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -4387,7 +4389,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] @@ -4417,6 +4419,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 
v2, -1, v6, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4426,7 +4429,6 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -4777,6 +4779,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -4792,7 +4795,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] @@ -4822,6 +4825,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4831,7 +4835,6 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5219,6 +5222,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_cbranch_execz .LBB24_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5230,7 +5234,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -5642,6 +5645,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_cbranch_execz .LBB25_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5653,7 +5657,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6034,7 +6037,6 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -6301,6 +6303,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -6338,7 +6341,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6614,6 +6616,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -6651,7 +6654,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6963,7 +6965,6 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 
-; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -7222,6 +7223,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7258,7 +7260,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7526,6 +7527,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7562,7 +7564,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7856,7 +7857,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8092,7 +8092,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -8299,6 +8298,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8337,7 +8337,6 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8615,6 +8614,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8652,7 +8652,6 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; 
GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8947,8 +8946,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8968,7 +8968,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9269,6 +9268,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9292,8 +9292,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9313,7 +9314,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9624,6 +9624,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9647,8 +9648,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9668,7 +9670,6 @@ define bfloat 
@flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10001,6 +10002,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -10020,7 +10022,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -10312,6 +10313,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10335,6 +10337,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: 
v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10354,7 +10357,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10656,6 +10658,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10679,6 +10682,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10698,7 +10702,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11015,6 +11018,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 
0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -11033,7 +11037,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11298,8 +11301,9 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11317,7 +11321,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -11565,6 +11568,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; 
GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11588,8 +11592,9 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11610,7 +11615,6 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11923,6 +11927,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11946,6 +11951,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: 
v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11966,7 +11972,6 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12296,7 +12301,6 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -12532,7 +12536,6 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12771,7 +12774,6 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13027,7 +13029,6 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -13254,7 +13255,6 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13488,7 +13488,6 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13742,7 +13741,6 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13983,7 +13981,6 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14223,9 +14220,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; 
GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14242,7 +14240,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -14569,9 +14566,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14588,7 +14586,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14918,9 +14915,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14937,7 +14935,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15284,8 +15281,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15302,7 +15301,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15619,8 +15617,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; 
GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15637,7 +15637,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15961,8 +15960,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15979,7 +15980,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16322,9 +16322,10 
@@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -16342,7 +16343,6 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16673,8 +16673,10 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16692,7 +16694,6 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 9c2a76380d83d..cd1a161346667 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -41,7 +41,6 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32: @@ -237,7 +236,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -437,7 +435,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -653,7 +650,6 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32: @@ -839,7 +835,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1032,7 +1027,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1246,7 +1240,6 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1447,7 +1440,6 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1648,7 +1640,6 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: @@ -1844,7 +1835,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2044,7 +2034,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2260,7 +2249,6 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__ftz: @@ -2446,7 +2434,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -2639,7 +2626,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -2853,7 +2839,6 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -3054,7 +3039,6 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3270,6 +3254,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_cbranch_execz .LBB16_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3279,7 +3264,6 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64: @@ -3642,6 +3626,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -3658,7 +3643,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: .LBB17_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB17_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -3687,6 +3672,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_cbranch_execz .LBB17_2 ; GFX12-NEXT: .LBB17_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 
@@ -3694,7 +3680,6 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -4088,6 +4073,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -4104,7 +4090,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] @@ -4133,6 +4119,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_cbranch_execz .LBB18_2 ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4140,7 +4127,6 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -4547,7 +4533,6 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr 
%ptr, double %val) #0 { ; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -4575,6 +4560,7 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4582,7 +4568,6 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64: @@ -4952,6 +4937,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -4967,7 +4953,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -4995,6 +4981,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; 
%atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5002,7 +4989,6 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -5388,6 +5374,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo @@ -5403,7 +5390,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] @@ -5431,6 +5418,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5438,7 +5426,6 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5863,7 +5850,6 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16: @@ -6118,6 +6104,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -6153,7 +6140,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -6417,6 +6403,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -6452,7 +6439,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -6749,7 +6735,6 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16: @@ -6995,6 +6980,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -7028,7 +7014,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -7283,6 +7268,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -7316,7 +7302,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7595,7 +7580,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -7816,7 +7800,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -8010,6 +7993,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8046,7 +8030,6 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: @@ -8312,6 +8295,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ 
-8346,7 +8330,6 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8628,8 +8611,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8649,7 +8633,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16: @@ -8949,6 +8932,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8972,8 +8956,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8993,7 +8978,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -9303,6 +9287,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9326,8 +9311,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9347,7 +9333,6 @@ define bfloat 
@flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -9679,6 +9664,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -9698,7 +9684,6 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16: @@ -9989,6 +9974,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10012,6 +9998,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 
@@ -10031,7 +10018,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10332,6 +10318,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10355,6 +10342,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10374,7 +10362,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10690,6 +10677,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: 
v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -10708,7 +10696,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -10972,8 +10959,9 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10991,7 +10979,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11238,6 +11225,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11261,8 +11249,9 @@ define bfloat 
@flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11283,7 +11272,6 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -11595,6 +11583,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11618,6 +11607,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11638,7 +11628,6 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: 
s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -11965,7 +11954,6 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16: @@ -12186,7 +12174,6 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -12410,7 +12397,6 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -12650,7 +12636,6 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16: @@ -12860,7 +12845,6 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13077,7 +13061,6 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13315,7 +13298,6 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -13540,7 +13522,6 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13766,9 +13747,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -13785,7 +13767,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; 
GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16: @@ -14112,9 +14093,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14131,7 +14113,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -14461,9 +14442,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14480,7 +14462,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, 
exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -14827,8 +14808,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -14845,7 +14828,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16: @@ -15162,8 +15144,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15180,7 +15164,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: 
s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -15504,8 +15487,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15522,7 +15507,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -15865,9 +15849,10 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -15885,7 +15870,6 @@ define <2 x bfloat> 
@flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -16216,8 +16200,10 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16235,7 +16221,6 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index b4d7ff8e7c526..0b6bdedeb48fc 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -128,7 +128,6 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB0_2 ; GFX12-NEXT: .LBB0_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -271,7 +270,6 @@ define 
amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB1_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB1_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -428,7 +426,6 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB2_2 ; GFX12-NEXT: .LBB2_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -577,7 +574,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB3_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB3_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -1002,7 +998,6 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB6_2 ; GFX12-NEXT: .LBB6_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -1145,7 +1140,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB7_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -1287,7 +1281,6 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: 
s_cbranch_execnz .LBB8_2 ; GFX12-NEXT: .LBB8_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -1428,7 +1421,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB9_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB9_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -1583,7 +1575,6 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB10_2 ; GFX12-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -1730,7 +1721,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB11_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB11_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -2149,7 +2139,6 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB14_2 ; GFX12-NEXT: .LBB14_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -2290,7 +2279,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB15_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB15_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -2434,7 +2422,6 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB16_2 ; GFX12-NEXT: .LBB16_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -2577,7 +2564,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB17_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB17_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -2734,7 +2720,6 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB18_2 ; GFX12-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -2883,7 +2868,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB19_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB19_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -3308,7 +3292,6 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB22_2 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ 
-3451,7 +3434,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB23_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -3595,7 +3577,6 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB24_2 ; GFX12-NEXT: .LBB24_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -3741,7 +3722,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB25_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB25_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -3899,7 +3879,6 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB26_2 ; GFX12-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -4051,7 +4030,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB27_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB27_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -4481,7 +4459,6 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv 
scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB30_2 ; GFX12-NEXT: .LBB30_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -4627,7 +4604,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB31_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB31_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -4772,7 +4748,6 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB32_2 ; GFX12-NEXT: .LBB32_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -4918,7 +4893,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: .LBB33_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB33_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -5076,7 +5050,6 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB34_2 ; GFX12-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -5228,7 +5201,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: .LBB35_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB35_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -5658,7 +5630,6 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB38_2 ; GFX12-NEXT: .LBB38_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -5804,7 +5775,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: .LBB39_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB39_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -5949,7 +5919,6 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB40_2 ; GFX12-NEXT: .LBB40_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -6095,7 +6064,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB41_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB41_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -6253,7 +6221,6 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB42_2 ; GFX12-NEXT: .LBB42_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 
v[0:1], off, s0 @@ -6405,7 +6372,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB43_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB43_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -6835,7 +6801,6 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB46_2 ; GFX12-NEXT: .LBB46_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -6981,7 +6946,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB47_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB47_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -7126,7 +7090,6 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB48_2 ; GFX12-NEXT: .LBB48_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -7272,7 +7235,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: .LBB49_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB49_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -7430,7 +7392,6 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; 
GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB50_2 ; GFX12-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -7582,7 +7543,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: .LBB51_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB51_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -8012,7 +7972,6 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_cbranch_execnz .LBB54_2 ; GFX12-NEXT: .LBB54_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -8158,7 +8117,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: .LBB55_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB55_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -8301,7 +8259,6 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB56_2 ; GFX12-NEXT: .LBB56_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -8442,7 +8399,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: .LBB57_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB57_3: ; %atomicrmw.private -; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -8597,7 +8553,6 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB58_2 ; GFX12-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -8744,7 +8699,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB59_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB59_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -9163,7 +9117,6 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB62_2 ; GFX12-NEXT: .LBB62_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -9304,7 +9257,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: .LBB63_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB63_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -9439,7 +9391,6 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_cbranch_execnz .LBB64_2 ; GFX12-NEXT: .LBB64_4: ; %atomicrmw.private ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: 
s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -9566,7 +9517,6 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX12-NEXT: s_cbranch_execnz .LBB65_2 ; GFX12-NEXT: .LBB65_4: ; %atomicrmw.private ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -9693,7 +9643,6 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX12-NEXT: s_cbranch_execnz .LBB66_2 ; GFX12-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -9828,7 +9777,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: .LBB67_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB67_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 @@ -9975,7 +9923,6 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_cbranch_execnz .LBB68_2 ; GFX12-NEXT: .LBB68_4: ; %atomicrmw.private ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -10116,7 +10063,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: .LBB69_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB69_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: 
v_dual_mov_b32 v3, s5 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 @@ -10513,7 +10459,6 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_cbranch_execnz .LBB72_2 ; GFX12-NEXT: .LBB72_4: ; %atomicrmw.private ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -10648,7 +10593,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: .LBB73_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB73_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 @@ -10789,7 +10733,6 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB74_2 ; GFX12-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -10930,7 +10873,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB75_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB75_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -11085,7 +11027,6 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB76_2 ; GFX12-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ 
-11232,7 +11173,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB77_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB77_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -11651,7 +11591,6 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB80_2 ; GFX12-NEXT: .LBB80_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -11792,7 +11731,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB81_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB81_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -12355,7 +12293,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB90_2 ; GFX12-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -12507,7 +12444,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB91_2 ; GFX12-NEXT: .LBB91_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -12654,7 +12590,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr 
%out2, i6 ; GFX12-NEXT: .LBB92_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB92_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -12816,7 +12751,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB93_2 ; GFX12-NEXT: .LBB93_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -12977,7 +12911,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: .LBB94_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB94_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12-NEXT: s_cselect_b32 s2, s2, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 @@ -13425,7 +13358,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB97_2 ; GFX12-NEXT: .LBB97_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -13580,7 +13512,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: .LBB98_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB98_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12-NEXT: s_cselect_b32 s2, s2, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 @@ -14138,7 +14069,6 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB107_2 ; GFX12-NEXT: .LBB107_4: 
; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -14146,6 +14076,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -14288,7 +14219,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB108_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB108_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -14296,6 +14226,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 @@ -14452,7 +14383,6 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB109_2 ; GFX12-NEXT: .LBB109_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -14460,6 +14390,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: v_add_co_u32 v2, 
vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -14608,7 +14539,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB110_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB110_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -14616,6 +14546,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 @@ -14762,6 +14693,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -14905,6 +14837,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, 
v3 :: v_dual_cndmask_b32 v2, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 @@ -15054,7 +14987,6 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB113_2 ; GFX12-NEXT: .LBB113_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -15062,6 +14994,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 @@ -15204,7 +15137,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB114_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB114_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s0, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 @@ -15212,6 +15144,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 @@ -15361,7 +15294,6 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB115_2 ; GFX12-NEXT: .LBB115_4: ; %atomicrmw.private 
-; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s4, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 @@ -15369,11 +15301,11 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] ; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 @@ -15524,7 +15456,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB116_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB116_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s6, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 @@ -15532,11 +15463,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, 
s4, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 @@ -15699,7 +15630,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB117_2 ; GFX12-NEXT: .LBB117_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s4, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 @@ -15707,11 +15637,11 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] ; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 @@ -15868,7 +15798,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: .LBB118_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB118_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s6, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 @@ -15876,11 +15805,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, 
v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 @@ -16033,11 +15962,11 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] ; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 @@ -16189,11 +16118,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 @@ -16349,7 +16278,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB121_2 ; GFX12-NEXT: .LBB121_4: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s4, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 @@ -16357,11 +16285,11 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] ; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 @@ -16512,7 +16440,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB122_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB122_3: ; %atomicrmw.private -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12-NEXT: s_cselect_b32 s6, s0, -1 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 @@ -16520,11 +16447,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index 822d40f7349b0..52a23690dcf53 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -250,6 +250,7 @@ define i32 @test_D139469_f16(half %arg) { ; GFX12-SDAG-NEXT: v_min_num_f16_e32 v0, v2, v1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -400,9 +401,11 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX12-SDAG-NEXT: v_pk_min_num_f16 v0, v1, v0 ; GFX12-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index f0fa621e3b4bc..53f580cc59592 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -80,6 +80,7 @@ define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inre ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_maximum3_f32 v0, s0, s1, v0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; ; GFX940-LABEL: s_fmaximum3_f32: @@ -1251,19 +1252,27 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, 
v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) ret half %max1 @@ -1280,19 +1289,27 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v2, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_commute: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v2, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0 -; GFX9-NEXT: 
s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v2, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_commute: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %c, half %max0) ret half %max1 @@ -1307,24 +1324,37 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_fmaximum3_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_max_f16_e32 v1, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_max_f16_e32 v1, s2, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX942-LABEL: s_fmaximum3_f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; 
GFX942-NEXT: v_max_f16_e32 v1, s0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_max_f16_e32 v1, s2, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_fmaximum3_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s2, s2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) %cast = bitcast half %max1 to i16 @@ -1344,19 +1374,28 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, |v0|, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fabs0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, |v0|, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fabs0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, |v0|, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1 +; GFX942-NEXT: s_nop 1 +; 
GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fabs0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) @@ -1374,19 +1413,28 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, |v1|, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fabs1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, v0, |v1| -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fabs1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, v0, |v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fabs1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %b.fabs = call half @llvm.fabs.f16(half %b) %max0 = call half @llvm.maximum.f16(half %a, half %b.fabs) %max1 = call half @llvm.maximum.f16(half %max0, half %c) @@ -1404,19 +1452,28 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, |v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fabs2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2| -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fabs2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e64 v1, v0, |v2| +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fabs2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %c.fabs = call half @llvm.fabs.f16(half %c) %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = 
call half @llvm.maximum.f16(half %max0, half %c.fabs) @@ -1434,19 +1491,30 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, |v0|, |v1|, |v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, |v0|, |v1| -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2| -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, |v0|, |v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e64 v1, v0, |v2| +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %b.fabs = call half @llvm.fabs.f16(half %b) %c.fabs = call half @llvm.fabs.f16(half %c) @@ -1466,19 +1534,30 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, -v0, -v1, -v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: 
v_fmaximum3_f16_fneg_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, -v0, -v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, -v0, -v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e64 v1, v0, -v2 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fneg_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a %b.fneg = fneg half %b %c.fneg = fneg half %c @@ -1498,19 +1577,30 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, -|v0|, -|v1|, -|v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fneg_fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, -|v0|, -|v1| -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; 
GFX9-NEXT: v_max_f16_e64 v1, v0, -|v2| -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fneg_fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, -|v0|, -|v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e64 v1, v0, -|v2| +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fneg_fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX950-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_or_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %b.fabs = call half @llvm.fabs.f16(half %b) %c.fabs = call half @llvm.fabs.f16(half %c) @@ -1533,19 +1623,28 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, -v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fneg0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, -v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fneg0: +; GFX942: ; %bb.0: +; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, -v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fneg0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) @@ -1563,19 +1662,28 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, -v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fneg1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, v0, -v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fneg1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, v0, -v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fneg1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg half %b %max0 = call half @llvm.maximum.f16(half %a, half %b.fneg) %max1 = call half @llvm.maximum.f16(half %max0, half %c) @@ -1593,19 +1701,28 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, -v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fneg2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fneg2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e64 v1, v0, -v2 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fneg2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 
v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg half %c %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg) @@ -1623,19 +1740,28 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, 0x4800, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_const0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v2, 0x4800, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_const0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v2, 0x4800, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_const0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_movk_i32 s0, 0x4800 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half 8.0, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) ret half %max1 @@ -1652,19 +1778,27 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 0x4800 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: 
v_fmaximum3_f16__const2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16__const2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_max_f16_e32 v1, 0x4800, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16__const2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_movk_i32 s0, 0x4800 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half 8.0) ret half %max1 @@ -1681,19 +1815,27 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, 4.0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_inlineimm0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v2, 4.0, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_inlineimm0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v2, 4.0, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_inlineimm0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half 4.0, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) ret half %max1 @@ -1710,19 +1852,27 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16__inlineimm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_max_f16_e32 v1, 4.0, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16__inlineimm: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; 
GFX942-NEXT: v_max_f16_e32 v1, 4.0, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16__inlineimm: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half 4.0) ret half %max1 @@ -1741,19 +1891,28 @@ define half @v_fmaximum3_f16_const1_const2(half %a) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, s0, 0x4c00 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_const1_const2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_max_f16_e32 v1, 0x4c00, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_const1_const2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v1, 0x4800, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_max_f16_e32 v1, 0x4c00, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_const1_const2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_movk_i32 s0, 0x4800 +; GFX950-NEXT: v_pk_maximum3_f16 
v0, v0, s0, s0 +; GFX950-NEXT: s_movk_i32 s0, 0x4c00 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half 8.0) %max1 = call half @llvm.maximum.f16(half %max0, half 16.0) ret half %max1 @@ -3005,6 +3164,7 @@ define amdgpu_ps <2 x i32> @s_fmaximum3_f64(double inreg %a, double inreg %b, do ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_readfirstlane_b32 s1, v1 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fmaximum3_f64: @@ -3620,20 +3780,30 @@ define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) { ; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_no_fmaximum3_f16__multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_no_fmaximum3_f16__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX942-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_no_fmaximum3_f16__multi_use: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v0, v2, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) %insert.0 = insertelement <2 x half> poison, half %max0, i32 0 @@ -3651,23 +3821,35 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f16__multi_use(half inreg %a, half in ; GFX12-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX12-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_no_fmaximum3_f16__multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_max_f16_e32 v1, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_max_f16_e32 v1, s2, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: ; return to shader part epilog +; GFX942-LABEL: s_no_fmaximum3_f16__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_max_f16_e32 v1, s0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_max_f16_e32 v1, s2, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: v_readfirstlane_b32 s1, v1 +; GFX942-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: 
s_no_fmaximum3_f16__multi_use: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v0, s2, s2 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: v_readfirstlane_b32 s1, v1 +; GFX950-NEXT: ; return to shader part epilog %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) %cast0 = bitcast half %max0 to i16 diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 7a8a224c76a83..dd7af4495e6a3 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -80,6 +80,7 @@ define amdgpu_ps i32 @s_fminimum3_f32(float inreg %a, float inreg %b, float inre ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_minimum3_f32 v0, s0, s1, v0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; ; GFX940-LABEL: s_fminimum3_f32: @@ -1251,19 +1252,27 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX942-NEXT: 
v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) ret half %max1 @@ -1280,19 +1289,27 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v2, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_commute: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v2, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v2, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_commute: +; GFX950: ; %bb.0: +; GFX950-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v2, v0, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %c, half %max0) ret half %max1 @@ -1307,24 +1324,37 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_fminimum3_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_min_f16_e32 v1, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_min_f16_e32 v1, s2, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX942-LABEL: s_fminimum3_f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_min_f16_e32 v1, s0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_min_f16_e32 v1, s2, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_fminimum3_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1 +; GFX950-NEXT: s_nop 0 +; 
GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s2, s2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) %cast = bitcast half %max1 to i16 @@ -1344,19 +1374,28 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, |v0|, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fabs0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, |v0|, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fabs0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, |v0|, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fabs0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %max0 = call half @llvm.minimum.f16(half %a.fabs, half %b) %max1 = call half 
@llvm.minimum.f16(half %max0, half %c) @@ -1374,19 +1413,28 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, |v1|, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fabs1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, v0, |v1| -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fabs1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, v0, |v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fabs1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %b.fabs = call half @llvm.fabs.f16(half %b) %max0 = call half @llvm.minimum.f16(half %a, half %b.fabs) %max1 = call half @llvm.minimum.f16(half %max0, half %c) @@ -1404,19 +1452,28 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, |v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fabs2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
v_min_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2| -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fabs2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e64 v1, v0, |v2| +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fabs2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %c.fabs = call half @llvm.fabs.f16(half %c) %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs) @@ -1434,19 +1491,30 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, |v0|, |v1|, |v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, |v0|, |v1| -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2| -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 
v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, |v0|, |v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e64 v1, v0, |v2| +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %b.fabs = call half @llvm.fabs.f16(half %b) %c.fabs = call half @llvm.fabs.f16(half %c) @@ -1466,19 +1534,30 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, -v0, -v1, -v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fneg_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, -v0, -v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, -v0, -v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; 
GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e64 v1, v0, -v2 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fneg_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a %b.fneg = fneg half %b %c.fneg = fneg half %c @@ -1498,19 +1577,30 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, -|v0|, -|v1|, -|v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fneg_fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, -|v0|, -|v1| -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e64 v1, v0, -|v2| -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fneg_fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, -|v0|, -|v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e64 v1, v0, -|v2| +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc 
+; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fneg_fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX950-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_or_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %b.fabs = call half @llvm.fabs.f16(half %b) %c.fabs = call half @llvm.fabs.f16(half %c) @@ -1533,19 +1623,28 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, -v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fneg0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, -v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fneg0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, -v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fneg0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; 
GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) @@ -1563,19 +1662,28 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, -v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fneg1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, v0, -v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fneg1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, v0, -v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fneg1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg half %b %max0 = call half @llvm.minimum.f16(half %a, half %b.fneg) %max1 = call half @llvm.minimum.f16(half %max0, half %c) @@ -1593,19 +1701,28 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, -v2 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fneg2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fneg2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e64 v1, v0, -v2 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fneg2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg half %c %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg) @@ -1623,19 +1740,28 @@ define half @v_fminimum3_f16_const0(half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, 0x4800, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_const0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v2, 0x4800, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_min_f16_e32 
v2, v0, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_const0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v2, 0x4800, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_const0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_movk_i32 s0, 0x4800 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half 8.0, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) ret half %max1 @@ -1652,19 +1778,27 @@ define half @v_fminimum3_f16__const2(half %a, half %b) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 0x4800 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16__const2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16__const2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: 
v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_min_f16_e32 v1, 0x4800, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16__const2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_movk_i32 s0, 0x4800 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half 8.0) ret half %max1 @@ -1681,19 +1815,27 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, 4.0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_inlineimm0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v2, 4.0, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_inlineimm0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v2, 4.0, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_inlineimm0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half 4.0, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) ret half %max1 @@ -1710,19 +1852,27 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16__inlineimm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_min_f16_e32 v1, 4.0, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16__inlineimm: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16__inlineimm: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half 4.0) ret half %max1 @@ -1741,19 +1891,28 @@ define half @v_fminimum3_f16_const1_const2(half %a) { ; GFX12-NEXT: 
v_minimum3_f16 v0, v0, s0, 0x4c00 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_const1_const2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_min_f16_e32 v1, 0x4c00, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_const1_const2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v1, 0x4800, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_min_f16_e32 v1, 0x4c00, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_const1_const2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_movk_i32 s0, 0x4800 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_movk_i32 s0, 0x4c00 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half 8.0) %max1 = call half @llvm.minimum.f16(half %max0, half 16.0) ret half %max1 @@ -3005,6 +3164,7 @@ define amdgpu_ps <2 x i32> @s_fminimum3_f64(double inreg %a, double inreg %b, do ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_readfirstlane_b32 s1, v1 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fminimum3_f64: @@ -3620,20 +3780,30 @@ define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half 
%b, half %c) { ; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_no_fminimum3_f16__multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_no_fminimum3_f16__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX942-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_no_fminimum3_f16__multi_use: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v0, v2, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) %insert.0 = insertelement <2 x half> poison, half %max0, i32 0 @@ -3651,23 +3821,35 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f16__multi_use(half inreg %a, half in ; GFX12-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX12-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_no_fminimum3_f16__multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 
v0, s1 -; GFX9-NEXT: v_min_f16_e32 v1, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_min_f16_e32 v1, s2, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: ; return to shader part epilog +; GFX942-LABEL: s_no_fminimum3_f16__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_min_f16_e32 v1, s0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_min_f16_e32 v1, s2, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: v_readfirstlane_b32 s1, v1 +; GFX942-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_no_fminimum3_f16__multi_use: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v0, s2, s2 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: v_readfirstlane_b32 s1, v1 +; GFX950-NEXT: ; return to shader part epilog %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) %cast0 = bitcast half %max0 to i16 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index b32630a97b3ad..f4c5ebd8b3cf5 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -3000,18 +3000,19 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-NEXT: v_and_b32_e32 v4, 7, v2 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; SI-NEXT: s_or_b64 vcc, s[4:5], vcc -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_mov_b32_e32 v5, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; SI-NEXT: s_movk_i32 s4, 0x40f -; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -3049,18 +3050,19 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; VI-NEXT: v_and_b32_e32 v4, 7, v2 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v4, v4, v5 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; VI-NEXT: s_or_b64 vcc, s[4:5], vcc -; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; VI-NEXT: v_mov_b32_e32 v5, 0x7e00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 
0, 1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; VI-NEXT: s_movk_i32 s4, 0x40f -; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-NEXT: v_mov_b32_e32 v2, 0x8000 @@ -3085,8 +3087,7 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13 ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3094,21 +3095,23 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4 -; GFX11-NEXT: s_or_b32 
vcc_lo, s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 @@ -3149,18 +3152,19 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-NEXT: v_and_b32_e32 v4, 7, v2 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; SI-NEXT: s_or_b64 vcc, s[4:5], vcc -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_mov_b32_e32 v5, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; SI-NEXT: s_movk_i32 s4, 0x40f -; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -3196,18 +3200,19 @@ define half 
@v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; VI-NEXT: v_and_b32_e32 v4, 7, v2 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v4, v4, v5 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; VI-NEXT: s_or_b64 vcc, s[4:5], vcc -; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; VI-NEXT: v_mov_b32_e32 v5, 0x7e00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; VI-NEXT: s_movk_i32 s4, 0x40f -; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-NEXT: v_mov_b32_e32 v2, 0x8000 @@ -3229,9 +3234,8 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0 ; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 @@ -3242,20 +3246,22 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 ; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: v_cndmask_b32_e64 
v0, 0, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo @@ -3298,18 +3304,19 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; SI-NEXT: v_and_b32_e32 v5, 7, v2 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v5 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; 
SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; SI-NEXT: s_or_b64 vcc, s[4:5], vcc -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; SI-NEXT: v_mov_b32_e32 v5, 0x7c00 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 ; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v6, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; SI-NEXT: s_movk_i32 s4, 0x40f -; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 @@ -3349,18 +3356,19 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; VI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; VI-NEXT: v_and_b32_e32 v6, 7, v4 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v6 +; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v6, v6, v7 ; VI-NEXT: v_lshrrev_b32_e32 v4, 2, v4 -; VI-NEXT: s_or_b64 vcc, s[4:5], vcc -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_mov_b32_e32 v6, 0x7c00 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 ; VI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; VI-NEXT: v_mov_b32_e32 v7, 0x7e00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; VI-NEXT: s_movk_i32 s4, 0x40f -; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v5 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-NEXT: v_mov_b32_e32 v4, 0x8000 @@ -3383,9 +3391,8 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v2, 0xffe, v3, v2 ; GFX11-NEXT: v_med3_i32 v3, v5, 0, 13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v5, 0x1000, v2 -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, v3, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v3, v6 @@ -3396,27 +3403,29 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0xfffffc10, v4 ; GFX11-NEXT: v_lshl_or_b32 v5, v4, 12, v2 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v2, v2, 9, 0x7c00 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v5, 7, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v6, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v6, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_or_b32 v3, 0x8000, v5, v2 ; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg.a = fneg double %a @@ -3456,18 +3465,19 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; SI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; SI-NEXT: v_and_b32_e32 v7, 7, v5 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v7 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 -; SI-NEXT: s_or_b64 vcc, s[4:5], vcc -; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; SI-NEXT: v_mov_b32_e32 v7, 0x7c00 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 ; SI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; SI-NEXT: v_mov_b32_e32 v8, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_lshlrev_b32_e32 v4, 9, v4 ; SI-NEXT: s_movk_i32 s4, 0x40f -; SI-NEXT: v_or_b32_e32 v4, 0x7c00, v4 +; SI-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v6 ; 
SI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 @@ -3506,19 +3516,20 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; VI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; VI-NEXT: v_and_b32_e32 v7, 7, v5 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v7 -; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 -; VI-NEXT: s_or_b64 vcc, s[4:5], vcc -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; VI-NEXT: v_mul_f64 v[2:3], -v[0:1], v[2:3] +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v7 ; VI-NEXT: v_mov_b32_e32 v7, 0x7c00 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 ; VI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; VI-NEXT: v_mov_b32_e32 v8, 0x7e00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v4, 9, v4 ; VI-NEXT: s_movk_i32 s4, 0x40f -; VI-NEXT: v_or_b32_e32 v4, 0x7c00, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v6 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 0x8000 @@ -3537,42 +3548,43 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; GFX11-NEXT: v_mul_f64 v[2:3], -v[0:1], v[2:3] ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v7, 0x3f1, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6 ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11-NEXT: v_and_or_b32 v4, 0xffe, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_med3_i32 v5, v7, 0, 13 ; GFX11-NEXT: v_or_b32_e32 v7, 0x1000, v4 -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v8, v5, v7 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v5, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v7 -; GFX11-NEXT: v_lshl_or_b32 v7, v6, 12, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6 -; GFX11-NEXT: v_lshl_or_b32 v4, v4, 9, 0x7c00 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v5, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v5, 7, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v7, v6, 12, v4 +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 7, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 2, v5 +; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo +; 
GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX11-NEXT: v_dual_mov_b32 v7, 0x7e00 :: v_dual_add_nc_u32 v0, v5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7c00, v7, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 ; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3613,18 +3625,19 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-NEXT: v_and_b32_e32 v4, 7, v2 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; SI-NEXT: s_or_b64 vcc, s[4:5], vcc -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_mov_b32_e32 v5, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; SI-NEXT: s_movk_i32 s4, 0x40f -; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 +; 
SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -3662,18 +3675,19 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; VI-NEXT: v_and_b32_e32 v4, 7, v2 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v4, v4, v5 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; VI-NEXT: s_or_b64 vcc, s[4:5], vcc -; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; VI-NEXT: v_mov_b32_e32 v5, 0x7e00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; VI-NEXT: s_movk_i32 s4, 0x40f -; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; VI-NEXT: v_mov_b32_e32 v2, 0x8000 @@ -3696,9 +3710,8 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0 ; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 @@ -3709,20 
+3722,22 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 ; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll 
b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index b2d30b751ae2c..e1791daa3aa0c 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -828,9 +828,9 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) { ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GCN-NEXT: v_bfrev_b32_e32 v2, 1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -839,14 +839,14 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 1, v0 -; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_cndmask_b32 v1, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 31, v5 -; GFX11-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 0x80000000, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v1, v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %i = and i32 %arg, 1 %i3 = icmp eq i32 %i, 0 @@ -907,12 +907,13 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) { ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff8000 ; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, 1, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 31, v2 -; GFX7-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -922,9 +923,9 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) { ; GFX9-NEXT: v_and_b32_e32 v3, 1, v0 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff8000 ; GFX9-NEXT: v_cmp_lt_i16_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 15, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GFX9-NEXT: v_xor_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 @@ -934,16 +935,14 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v3, 1, v0 -; GFX11-NEXT: v_cmp_lt_i16_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3 -; GFX11-NEXT: v_lshlrev_b16 v0, 15, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_lt_i16_e32 vcc_lo, 1, v0 ; 
GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 0xffff8000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v0, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %i = and i16 %arg, 1 diff --git a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir index 099b066cde255..3ac463b4fb448 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir @@ -267,8 +267,8 @@ body: | # GCN: %0:vgpr_32 = COPY $vgpr0 # GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec # GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 -# GCN-NEXT: %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec -# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %3:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec +# GCN-NEXT: %4:sreg_32_xm0 = S_MOV_B32 0 --- name: fold-imm-readfirstlane-regsequence0 tracksRegLiveness: true @@ -278,8 +278,8 @@ body: | %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 - %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec - %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec + %3:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec S_NOP 0, implicit %3, implicit %4 ... 
@@ -288,8 +288,8 @@ body: | # GCN: %0:vgpr_32 = COPY $vgpr0 # GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec # GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, killed %0, %subreg.sub1 -# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 -# GCN-NEXT: %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec +# GCN-NEXT: %3:sreg_32_xm0 = S_MOV_B32 0 +# GCN-NEXT: %4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec --- name: fold-imm-readfirstlane-regsequence1 @@ -300,8 +300,8 @@ body: | %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %2:vreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, killed %0:vgpr_32, %subreg.sub1 - %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec - %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec + %3:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec S_NOP 0, implicit %3, implicit %4 ... @@ -310,8 +310,8 @@ body: | # GCN: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec # GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec # GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 -# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 -# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 1 +# GCN-NEXT: %3:sreg_32_xm0 = S_MOV_B32 0 +# GCN-NEXT: %4:sreg_32_xm0 = S_MOV_B32 1 --- name: fold-imm-readfirstlane-regsequence2 tracksRegLiveness: true @@ -320,8 +320,8 @@ body: | %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 - %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec - %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec + %3:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec S_NOP 0, implicit %3, implicit %4 ... 
@@ -330,8 +330,8 @@ body: | # GCN: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec # GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec # GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 -# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 -# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %3:sreg_32_xm0 = S_MOV_B32 0 +# GCN-NEXT: %4:sreg_32_xm0 = S_MOV_B32 0 --- name: fold-imm-readfirstlane-regsequence3 tracksRegLiveness: true @@ -340,8 +340,8 @@ body: | %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 - %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec - %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec + %3:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec S_NOP 0, implicit %3, implicit %4 ... @@ -350,8 +350,8 @@ body: | # GCN: %0:vgpr_32 = COPY $sgpr10 # GCN-NEXT: %1:vgpr_32 = COPY $sgpr11 # GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 -# GCN-NEXT: %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec -# GCN-NEXT: %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec +# GCN-NEXT: %3:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec +# GCN-NEXT: %4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec --- name: fold-copy-readfirstlane-regsequence0 tracksRegLiveness: true @@ -361,8 +361,8 @@ body: | %0:vgpr_32 = COPY $sgpr10 %1:vgpr_32 = COPY $sgpr11 %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 - %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec - %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec + %3:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sreg_32_xm0 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec ... 
# GCN-LABEL: name: fold-copy-readfirstlane-regsequence1{{$}} @@ -371,8 +371,8 @@ body: | # GCN-NEXT: %2:vgpr_32 = COPY %0 # GCN-NEXT: %3:vgpr_32 = COPY %1 # GCN-NEXT: %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, killed %3, %subreg.sub1 -# GCN-NEXT: %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec -# GCN-NEXT: %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec +# GCN-NEXT: %5:sreg_32_xm0 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec +# GCN-NEXT: %6:sreg_32_xm0 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec --- name: fold-copy-readfirstlane-regsequence1 tracksRegLiveness: true @@ -384,6 +384,6 @@ body: | %2:vgpr_32 = COPY %0 %3:vgpr_32 = COPY %1 %4:vreg_64 = REG_SEQUENCE %2:vgpr_32, %subreg.sub0, killed %3:vgpr_32, %subreg.sub1 - %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0:vreg_64, implicit $exec - %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1:vreg_64, implicit $exec + %5:sreg_32_xm0 = V_READFIRSTLANE_B32 %4.sub0:vreg_64, implicit $exec + %6:sreg_32_xm0 = V_READFIRSTLANE_B32 %4.sub1:vreg_64, implicit $exec ... 
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 188b2ada64686..6ab17c91c1439 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -654,7 +654,9 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] ; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v2 +; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX950-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX950-SDAG-NEXT: s_endpgm ; @@ -666,11 +668,11 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] -; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v2 +; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX950-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX950-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 3d3e8bea7e33e..4bab6eaab6f7d 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -97,59 +97,53 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_movk_i32 s2, 0x7e00 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_lshr_b32 s4, s7, 8 -; SI-NEXT: s_and_b32 s5, s7, 0x1ff -; SI-NEXT: s_and_b32 s8, s4, 0xffe -; SI-NEXT: s_or_b32 s4, s5, s6 -; SI-NEXT: s_cmp_lg_u32 s4, 0 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: s_bfe_u32 s4, s7, 0xb0014 -; SI-NEXT: v_readfirstlane_b32 s5, v0 -; SI-NEXT: s_sub_i32 s6, 0x3f1, s4 -; SI-NEXT: s_add_i32 s10, s4, 0xfffffc10 -; SI-NEXT: s_or_b32 s11, s8, s5 +; SI-NEXT: s_lshr_b32 s0, s7, 8 +; SI-NEXT: s_and_b32 s1, s7, 0x1ff +; SI-NEXT: s_and_b32 s8, s0, 0xffe +; SI-NEXT: s_or_b32 s0, s1, s6 +; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 +; SI-NEXT: v_readfirstlane_b32 s1, v0 +; SI-NEXT: s_sub_i32 s6, 0x3f1, s0 +; SI-NEXT: s_or_b32 s1, s8, s1 ; SI-NEXT: v_med3_i32 v0, s6, 0, 13 -; SI-NEXT: s_lshl_b32 s4, s10, 12 -; SI-NEXT: s_or_b32 s5, s11, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: s_or_b32 s4, s11, s4 -; SI-NEXT: s_lshr_b32 s6, s5, s6 -; SI-NEXT: v_lshl_b32_e32 v0, s6, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, s5, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_readfirstlane_b32 s5, v0 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_cmp_lt_i32 s10, 1 -; SI-NEXT: s_cselect_b32 s6, s5, s4 +; SI-NEXT: s_or_b32 s6, s1, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: s_lshr_b32 s9, s6, s8 +; SI-NEXT: s_lshl_b32 s8, s9, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s6 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_addk_i32 s0, 0xfc10 +; SI-NEXT: s_or_b32 s6, s9, s6 +; SI-NEXT: s_lshl_b32 s8, s0, 12 +; SI-NEXT: s_or_b32 s8, s1, s8 +; SI-NEXT: s_cmp_lt_i32 s0, 1 +; SI-NEXT: s_cselect_b32 s6, s6, s8 ; SI-NEXT: s_and_b32 s8, s6, 7 ; SI-NEXT: s_cmp_gt_i32 s8, 5 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_cselect_b32 s9, 1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 3 -; SI-NEXT: s_cselect_b64 s[8:9], 
-1, 0 +; SI-NEXT: s_cselect_b32 s8, 1, 0 ; SI-NEXT: s_lshr_b32 s6, s6, 2 -; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_cmp_lg_u32 s4, 0 -; SI-NEXT: s_addc_u32 s4, s6, 0 -; SI-NEXT: s_cmp_lt_i32 s10, 31 -; SI-NEXT: s_cselect_b32 s6, s4, 0x7c00 -; SI-NEXT: s_cmp_lg_u32 s11, 0 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: s_cmpk_eq_i32 s10, 0x40f -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 -; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SI-NEXT: s_and_b32 s4, s4, 0x8000 -; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_add_i32 s6, s6, s8 +; SI-NEXT: s_cmp_lt_i32 s0, 31 +; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s1, 0 +; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s0, 0x40f +; SI-NEXT: s_cselect_b32 s0, s1, s6 +; SI-NEXT: s_lshr_b32 s1, s7, 16 +; SI-NEXT: s_and_b32 s1, s1, 0x8000 +; SI-NEXT: s_or_b32 s6, s1, s0 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -169,46 +163,41 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; VI-SAFE-SDAG-NEXT: s_bfe_u32 s5, s7, 0xb0014 -; VI-SAFE-SDAG-NEXT: s_or_b32 s6, s8, s4 -; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s5 +; VI-SAFE-SDAG-NEXT: s_bfe_u32 s6, s7, 0xb0014 +; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4 +; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s6 ; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13 -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s6, 0x1000 +; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 ; 
VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s8, v0 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s8, s4, s8 -; VI-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v0, s8 -; VI-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 -; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-SAFE-SDAG-NEXT: s_add_i32 s10, s5, 0xfffffc10 -; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; VI-SAFE-SDAG-NEXT: s_lshl_b32 s5, s10, 12 -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4 -; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5 -; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s10, 1 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s11, s4, s5 -; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s11, 7 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s9, s5, s8 +; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s9, s8 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s8, s5 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; VI-SAFE-SDAG-NEXT: s_addk_i32 s6, 0xfc10 +; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s6, 12 +; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s9, s5 +; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s4, s8 +; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 1 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s8 +; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s5, 7 ; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5 -; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s9, 1, 0 ; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3 -; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-SAFE-SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s8, s11, 2 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; VI-SAFE-SDAG-NEXT: s_addc_u32 s4, s8, 0 -; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s10, 31 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, s4, 0x7c00 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0 -; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; VI-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0 -; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f -; VI-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0 -; VI-SAFE-SDAG-NEXT: v_mov_b32_e32 v1, s8 -; VI-SAFE-SDAG-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-SAFE-SDAG-NEXT: 
s_lshr_b32 s4, s7, 16 -; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s4, 0x8000 -; VI-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s8, s9 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 +; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s8 +; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 31 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; VI-SAFE-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s6, 0x40f +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, s5 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s7, 16 +; VI-SAFE-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s5, s4 +; VI-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-SAFE-SDAG-NEXT: s_endpgm ; @@ -299,45 +288,41 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 ; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2 -; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 ; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 ; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12 ; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1 ; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5 ; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7 -; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6 -; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6 -; GFX10-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0 -; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5 +; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 +; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 +; 
GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 +; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 ; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 ; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 ; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 ; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0 ; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 ; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 -; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-SAFE-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6 ; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 +; GFX10-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00 ; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 ; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00 ; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s2, s3, 16 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5 +; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2 ; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-SAFE-SDAG-NEXT: s_and_b32 s2, s2, 0x8000 -; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0 -; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0 -; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo -; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX10-SAFE-SDAG-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-SAFE-SDAG-NEXT: s_endpgm @@ -430,53 +415,50 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 ; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2 -; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 ; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12 ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1 ; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7 -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6 -; GFX11-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0 -; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5 -; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 +; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 +; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 +; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 ; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 ; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0 ; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 ; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0 -; GFX11-SAFE-SDAG-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6 ; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 +; GFX11-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00 ; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 ; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00 ; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s2, s3, 16 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5 +; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2 ; GFX11-SAFE-SDAG-NEXT: 
s_mov_b32 s3, 0x31016000 -; GFX11-SAFE-SDAG-NEXT: s_and_b32 s2, s2, 0x8000 -; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0 -; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX11-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-SAFE-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll new file mode 100644 index 0000000000000..e5815e96fbe33 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s + +define <2 x half> @v_test_cvt_v2f32_v2f16(<2 x float> %src) { +; GFX950-LABEL: v_test_cvt_v2f32_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] + %res = fptrunc <2 x float> %src to <2 x half> + ret <2 x half> %res +} + +define half @fptrunc_v2f32_v2f16_then_extract(<2 x float> %src) { +; GFX950-LABEL: fptrunc_v2f32_v2f16_then_extract: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1 +; GFX950-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX950-NEXT: s_setpc_b64 s[30:31] + %vec_half = fptrunc <2 x float> %src to <2 x half> + 
%first = extractelement <2 x half> %vec_half, i64 1 + %second = extractelement <2 x half> %vec_half, i64 0 + %res = fadd half %first, %second + ret half %res +} + +define <2 x half> @v_test_cvt_v2f64_v2f16(<2 x double> %src) { +; GFX950-SDAG-LABEL: v_test_cvt_v2f64_v2f16: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0x1ff +; GFX950-SDAG-NEXT: v_and_or_b32 v0, v1, s0, v0 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX950-SDAG-NEXT: s_movk_i32 s1, 0xffe +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-SDAG-NEXT: v_bfe_u32 v5, v1, 20, 11 +; GFX950-SDAG-NEXT: v_and_or_b32 v0, v4, s1, v0 +; GFX950-SDAG-NEXT: v_sub_u32_e32 v6, 0x3f1, v5 +; GFX950-SDAG-NEXT: v_or_b32_e32 v4, 0x1000, v0 +; GFX950-SDAG-NEXT: v_med3_i32 v6, v6, 0, 13 +; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v7, v6, v4 +; GFX950-SDAG-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4 +; GFX950-SDAG-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5 +; GFX950-SDAG-NEXT: v_lshl_or_b32 v6, v5, 12, v0 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX950-SDAG-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX950-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0x40f +; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX950-SDAG-NEXT: v_and_b32_e32 v6, 7, v4 +; GFX950-SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; GFX950-SDAG-NEXT: s_mov_b32 s3, 0x8000 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX950-SDAG-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX950-SDAG-NEXT: v_add_u32_e32 v4, v4, v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0x7c00 +; GFX950-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 +; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v5 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-SDAG-NEXT: v_and_or_b32 v0, v1, s3, v0 +; GFX950-SDAG-NEXT: v_and_or_b32 v1, v3, s0, v2 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX950-SDAG-NEXT: v_bfe_u32 v4, v3, 20, 11 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX950-SDAG-NEXT: v_and_or_b32 v1, v2, s1, v1 +; GFX950-SDAG-NEXT: v_sub_u32_e32 v5, 0x3f1, v4 +; GFX950-SDAG-NEXT: v_or_b32_e32 v2, 0x1000, v1 +; GFX950-SDAG-NEXT: v_med3_i32 v5, v5, 0, 13 +; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v8, v5, v2 +; GFX950-SDAG-NEXT: v_lshlrev_b32_e32 v5, v5, v8 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 +; GFX950-SDAG-NEXT: v_add_u32_e32 v4, 0xfffffc10, v4 +; GFX950-SDAG-NEXT: v_lshl_or_b32 v5, v4, 12, v1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX950-SDAG-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX950-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX950-SDAG-NEXT: v_and_b32_e32 v5, 7, v2 +; GFX950-SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX950-SDAG-NEXT: v_or_b32_e32 v5, v5, v8 +; GFX950-SDAG-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX950-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; 
GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v4 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX950-SDAG-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX950-SDAG-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: v_test_cvt_v2f64_v2f16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 0x1ff +; GFX950-GISEL-NEXT: v_and_or_b32 v0, v1, v7, v0 +; GFX950-GISEL-NEXT: v_bfe_u32 v4, v1, 20, 11 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX950-GISEL-NEXT: v_add_u32_e32 v4, 0xfffffc10, v4 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0xffe +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-GISEL-NEXT: v_and_or_b32 v0, v5, v6, v0 +; GFX950-GISEL-NEXT: v_sub_u32_e32 v10, 1, v4 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX950-GISEL-NEXT: v_lshl_or_b32 v9, v4, 12, v0 +; GFX950-GISEL-NEXT: v_med3_i32 v10, v10, 0, 13 +; GFX950-GISEL-NEXT: v_or_b32_e32 v0, 0x1000, v0 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v11, v10, v0 +; GFX950-GISEL-NEXT: v_lshlrev_b32_e32 v10, v10, v11 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, v10, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, 0x7c00 +; GFX950-GISEL-NEXT: v_lshl_or_b32 v5, v5, 9, v8 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-GISEL-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX950-GISEL-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX950-GISEL-NEXT: v_and_or_b32 v2, v3, v7, v2 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; GFX950-GISEL-NEXT: v_and_b32_e32 v9, 7, v0 +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 
+; GFX950-GISEL-NEXT: v_cmp_lt_i32_e64 s[0:1], 5, v9 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v0, 2, v0 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX950-GISEL-NEXT: v_add_u32_e32 v0, v0, v9 +; GFX950-GISEL-NEXT: v_cmp_lt_i32_e32 vcc, 30, v4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, 0x40f +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0x8000 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX950-GISEL-NEXT: v_and_or_b32 v0, v1, v4, v0 +; GFX950-GISEL-NEXT: v_bfe_u32 v1, v3, 20, 11 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX950-GISEL-NEXT: v_add_u32_e32 v1, 0xfffffc10, v1 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX950-GISEL-NEXT: v_and_or_b32 v2, v5, v6, v2 +; GFX950-GISEL-NEXT: v_sub_u32_e32 v7, 1, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX950-GISEL-NEXT: v_lshl_or_b32 v6, v1, 12, v2 +; GFX950-GISEL-NEXT: v_med3_i32 v7, v7, 0, 13 +; GFX950-GISEL-NEXT: v_or_b32_e32 v2, 0x1000, v2 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v10, v7, v2 +; GFX950-GISEL-NEXT: v_lshlrev_b32_e32 v7, v7, v10 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, v7, v2 +; GFX950-GISEL-NEXT: v_lshl_or_b32 v5, v5, 9, v8 +; GFX950-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX950-GISEL-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX950-GISEL-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX950-GISEL-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; GFX950-GISEL-NEXT: v_cmp_lt_i32_e64 s[0:1], 5, v6 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: 
v_lshrrev_b32_e32 v2, 2, v2 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX950-GISEL-NEXT: v_add_u32_e32 v2, v2, v6 +; GFX950-GISEL-NEXT: v_cmp_lt_i32_e32 vcc, 30, v1 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX950-GISEL-NEXT: v_and_or_b32 v1, v2, v4, v1 +; GFX950-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %res = fptrunc <2 x double> %src to <2 x half> + ret <2 x half> %res +} diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index 80b4d64b1236f..8188aeae5438b 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -115,6 +115,7 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) nocapture writeonly ; GFX12-NEXT: v_fract_f32_e32 v3, v0 ; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| ; GFX12-NEXT: v_floor_f32_e32 v4, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b32 v[1:2], v4, off @@ -301,10 +302,11 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) nocapture w ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_floor_f32_e32 v3, v0 ; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v4, v0, v3 ; GFX12-NEXT: global_store_b32 v[1:2], v3, off ; GFX12-NEXT: v_min_num_f32_e32 v4, 0x3f7fffff, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -1423,6 +1425,7 @@ 
define float @wrong_commuted_nan_select_f32(float %x) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v1, v0, v1 ; GFX12-NEXT: v_min_num_f32_e32 v1, 0x3f7fffff, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2118,10 +2121,11 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap ; GFX12-NEXT: v_fract_f32_e32 v7, v1 ; GFX12-NEXT: v_floor_f32_e32 v4, v0 ; GFX12-NEXT: v_floor_f32_e32 v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 ; GFX12-NEXT: v_cmp_class_f32_e64 s0, v1, 0x204 ; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v1, v7, 0, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: @@ -2247,6 +2251,7 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon ; GFX12-NEXT: v_fract_f64_e32 v[4:5], v[0:1] ; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| ; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 ; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off @@ -2383,6 +2388,7 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly % ; GFX12-NEXT: v_fract_f16_e32 v3, v0 ; GFX12-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0| ; GFX12-NEXT: v_floor_f16_e32 v4, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b16 v[1:2], v4, off @@ -2564,14 +2570,16 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: 
v_fract_f16_e32 v6, v0 ; GFX12-NEXT: v_floor_f16_e32 v5, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_fract_f16_e32 v4, v3 ; GFX12-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204 ; GFX12-NEXT: v_floor_f16_e32 v7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 ; GFX12-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_pack_b32_f16 v4, v5, v7 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 ; GFX12-NEXT: global_store_b32 v[1:2], v4, off ; GFX12-NEXT: v_pack_b32_f16 v0, v0, v3 @@ -2733,6 +2741,7 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc ; GFX12-NEXT: v_cmp_class_f64_e64 s1, v[2:3], 0x204 ; GFX12-NEXT: v_floor_f64_e32 v[8:9], v[2:3] ; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v10, 0, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v1, v11, 0, s0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v12, 0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll index c1d5b5857b6b5..87c7cce854b11 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -1837,8 +1837,7 @@ define float @v_sqrt_f32_ulp2(float %x) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, 
vcc @@ -1874,8 +1873,7 @@ define float @v_sqrt_f32_ulp25(float %x) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -1911,8 +1909,7 @@ define float @v_sqrt_f32_ulp3(float %x) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -1947,8 +1944,7 @@ define float @v_sqrt_f32_ulp2_fabs(float %x) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, s[4:5] ; SDAG-IEEE-NEXT: v_ldexp_f32_e64 v0, |v0|, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, s[4:5] @@ -2074,12 +2070,10 @@ define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; 
SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 @@ -2218,12 +2212,10 @@ define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s6, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5] ; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[6:7], |v1|, s6 ; SDAG-IEEE-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[6:7] -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[6:7] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_ldexp_f32_e64 v1, |v1|, v2 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 @@ -2315,8 +2307,7 @@ define float @v_sqrt_f32_ulp2_noncontractable_rcp(float %x) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -2404,8 +2395,7 @@ define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, 
v0, v2 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc @@ -2489,8 +2479,7 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc @@ -2574,8 +2563,7 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv_arcp(float %x, float %y) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc @@ -2631,12 +2619,10 @@ define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 @@ -2763,12 +2749,10 @@ define <2 x float> 
@v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, vcc ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, s[4:5] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 @@ -2900,12 +2884,10 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2 ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, vcc ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, s[4:5] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 @@ -3026,8 +3008,7 @@ define float @v_sqrt_f32_known_never_posdenormal_ulp2(float nofpclass(psub) %x) ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; 
SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -3062,8 +3043,7 @@ define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub) ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -3098,8 +3078,7 @@ define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -3698,8 +3677,7 @@ define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero n ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -3734,8 +3712,7 @@ define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; 
SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -3770,8 +3747,7 @@ define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc @@ -3910,8 +3886,7 @@ define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) { ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v0, v1 ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index 34ee90c68569f..42f098522b9ad 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -239,10 +239,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) { ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_bfrev_b32_e32 v1, 8 ; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec ; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 ; 
SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -308,10 +308,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) { ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_bfrev_b32_e32 v1, 8 ; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec ; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 ; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -377,10 +377,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) { ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_bfrev_b32_e32 v1, 8 ; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec ; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 ; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -446,10 +446,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) { ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_bfrev_b32_e32 v1, 8 ; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec ; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 ; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -677,11 +677,10 @@ 
define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) { ; SDAG-NEXT: s_brev_b32 s5, 8 ; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 ; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -703,8 +702,8 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) { ; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] ; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 @@ -874,11 +873,10 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) { ; SDAG-NEXT: s_brev_b32 s5, 8 ; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 ; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ 
-900,8 +898,8 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) { ; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] ; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 @@ -1102,11 +1100,10 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) { ; SDAG-NEXT: s_brev_b32 s5, 8 ; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] ; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5 ; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] @@ -1128,8 +1125,8 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) { ; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3] ; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 @@ -1201,16 +1198,14 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; SDAG-NEXT: s_mov_b32 s6, 0 ; SDAG-NEXT: s_brev_b32 s7, 8 ; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v10, 
0x100 ; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3] ; SDAG-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc ; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5] ; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; SDAG-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[6:7] ; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 6cd06c29b7edd..9f9c9439d2c28 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -2059,9 +2059,9 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s29, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 vcc, -1 +; GFX9-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, vcc +; GFX9-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-NEXT: v_writelane_b32 v40, s29, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll index 22257d3eba7d6..512d58d3f996d 100644 --- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll @@ -202,18 +202,18 @@ define void @indirect_use_50_vgpr() #0 { } ; GCN-LABEL: {{^}}use_80_sgpr: -; GCN: .set use_80_sgpr.num_vgpr, 1 +; 
GCN: .set use_80_sgpr.num_vgpr, 0 ; GCN: .set use_80_sgpr.num_agpr, 0 ; GCN: .set use_80_sgpr.numbered_sgpr, 80 -; GCN: .set use_80_sgpr.private_seg_size, 8 +; GCN: .set use_80_sgpr.private_seg_size, 0 ; GCN: .set use_80_sgpr.uses_vcc, 0 ; GCN: .set use_80_sgpr.uses_flat_scratch, 0 ; GCN: .set use_80_sgpr.has_dyn_sized_stack, 0 ; GCN: .set use_80_sgpr.has_recursion, 0 ; GCN: .set use_80_sgpr.has_indirect_call, 0 ; GCN: TotalNumSgprs: 84 -; GCN: NumVgprs: 1 -; GCN: ScratchSize: 8 +; GCN: NumVgprs: 0 +; GCN: ScratchSize: 0 define void @use_80_sgpr() #1 { call void asm sideeffect "", "~{s79}"() #0 ret void @@ -231,7 +231,7 @@ define void @use_80_sgpr() #1 { ; GCN: .set indirect_use_80_sgpr.has_indirect_call, or(0, use_80_sgpr.has_indirect_call) ; GCN: TotalNumSgprs: 84 ; GCN: NumVgprs: 41 -; GCN: ScratchSize: 24 +; GCN: ScratchSize: 16 define void @indirect_use_80_sgpr() #1 { call void @use_80_sgpr() ret void @@ -249,7 +249,7 @@ define void @indirect_use_80_sgpr() #1 { ; GCN: .set indirect_2_level_use_80_sgpr.has_indirect_call, or(0, indirect_use_80_sgpr.has_indirect_call) ; GCN: TotalNumSgprs: 86 ; GCN: NumVgprs: 41 -; GCN: ScratchSize: 24 +; GCN: ScratchSize: 16 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 { call void @indirect_use_80_sgpr() ret void @@ -336,14 +336,14 @@ define amdgpu_kernel void @indirect_2_level_use_stack() #0 { ; GCN-LABEL: {{^}}multi_call_use_use_stack: ; GCN: .set multi_call_use_use_stack.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr) ; GCN: .set multi_call_use_use_stack.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr) -; GCN: .set multi_call_use_use_stack.numbered_sgpr, max(44, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr) +; GCN: .set multi_call_use_use_stack.numbered_sgpr, max(52, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr) ; GCN: .set multi_call_use_use_stack.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size)) ; GCN: .set multi_call_use_use_stack.uses_vcc, 
or(1, use_stack0.uses_vcc, use_stack1.uses_vcc) ; GCN: .set multi_call_use_use_stack.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch) ; GCN: .set multi_call_use_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack) ; GCN: .set multi_call_use_use_stack.has_recursion, or(0, use_stack0.has_recursion, use_stack1.has_recursion) ; GCN: .set multi_call_use_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call) -; GCN: TotalNumSgprs: 50 +; GCN: TotalNumSgprs: 58 ; GCN: NumVgprs: 41 ; GCN: ScratchSize: 2052 define amdgpu_kernel void @multi_call_use_use_stack() #0 { @@ -357,7 +357,7 @@ declare void @external() #0 ; GCN-LABEL: {{^}}multi_call_with_external: ; GCN: .set multi_call_with_external.num_vgpr, max(41, amdgpu.max_num_vgpr) ; GCN: .set multi_call_with_external.num_agpr, max(0, amdgpu.max_num_agpr) -; GCN: .set multi_call_with_external.numbered_sgpr, max(44, amdgpu.max_num_sgpr) +; GCN: .set multi_call_with_external.numbered_sgpr, max(52, amdgpu.max_num_sgpr) ; GCN: .set multi_call_with_external.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size)) ; GCN: .set multi_call_with_external.uses_vcc, 1 ; GCN: .set multi_call_with_external.uses_flat_scratch, 1 @@ -377,7 +377,7 @@ define amdgpu_kernel void @multi_call_with_external() #0 { ; GCN-LABEL: {{^}}multi_call_with_external_and_duplicates: ; GCN: .set multi_call_with_external_and_duplicates.num_vgpr, max(41, amdgpu.max_num_vgpr) ; GCN: .set multi_call_with_external_and_duplicates.num_agpr, max(0, amdgpu.max_num_agpr) -; GCN: .set multi_call_with_external_and_duplicates.numbered_sgpr, max(46, amdgpu.max_num_sgpr) +; GCN: .set multi_call_with_external_and_duplicates.numbered_sgpr, max(54, amdgpu.max_num_sgpr) ; GCN: .set multi_call_with_external_and_duplicates.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size)) ; GCN: .set 
multi_call_with_external_and_duplicates.uses_vcc, 1 ; GCN: .set multi_call_with_external_and_duplicates.uses_flat_scratch, 1 @@ -594,7 +594,7 @@ define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 { ; GCN-LABEL: {{^}}multi_call_with_multi_stage_recurse: ; GCN: .set multi_call_with_multi_stage_recurse.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr, multi_stage_recurse1.num_vgpr) ; GCN: .set multi_call_with_multi_stage_recurse.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr, multi_stage_recurse1.num_agpr) -; GCN: .set multi_call_with_multi_stage_recurse.numbered_sgpr, max(45, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr, multi_stage_recurse1.numbered_sgpr) +; GCN: .set multi_call_with_multi_stage_recurse.numbered_sgpr, max(53, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr, multi_stage_recurse1.numbered_sgpr) ; GCN: .set multi_call_with_multi_stage_recurse.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size, multi_stage_recurse1.private_seg_size)) ; GCN: .set multi_call_with_multi_stage_recurse.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc, multi_stage_recurse1.uses_vcc) ; GCN: .set multi_call_with_multi_stage_recurse.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch, multi_stage_recurse1.uses_flat_scratch) diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll index afc315c318f74..8a60b32f2f1f8 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -40,14 +40,46 @@ define amdgpu_gfx void @gfx_func() { ; SDAG-NEXT: v_writelane_b32 v40, s27, 23 ; SDAG-NEXT: v_writelane_b32 v40, s28, 24 ; SDAG-NEXT: v_writelane_b32 v40, s29, 25 -; SDAG-NEXT: v_writelane_b32 v40, s30, 26 -; SDAG-NEXT: v_writelane_b32 v40, s31, 27 +; SDAG-NEXT: v_writelane_b32 v40, s72, 26 +; SDAG-NEXT: v_writelane_b32 v40, s73, 27 +; SDAG-NEXT: 
v_writelane_b32 v40, s74, 28 +; SDAG-NEXT: v_writelane_b32 v40, s75, 29 +; SDAG-NEXT: v_writelane_b32 v40, s76, 30 +; SDAG-NEXT: v_writelane_b32 v40, s77, 31 +; SDAG-NEXT: v_writelane_b32 v40, s78, 32 +; SDAG-NEXT: v_writelane_b32 v40, s79, 33 +; SDAG-NEXT: v_writelane_b32 v40, s88, 34 +; SDAG-NEXT: v_writelane_b32 v40, s89, 35 +; SDAG-NEXT: v_writelane_b32 v40, s90, 36 +; SDAG-NEXT: v_writelane_b32 v40, s91, 37 +; SDAG-NEXT: v_writelane_b32 v40, s92, 38 +; SDAG-NEXT: v_writelane_b32 v40, s93, 39 +; SDAG-NEXT: v_writelane_b32 v40, s94, 40 +; SDAG-NEXT: v_writelane_b32 v40, s95, 41 +; SDAG-NEXT: v_writelane_b32 v40, s30, 42 +; SDAG-NEXT: v_writelane_b32 v40, s31, 43 ; SDAG-NEXT: s_mov_b32 s35, extern_c_func@abs32@hi ; SDAG-NEXT: s_mov_b32 s34, extern_c_func@abs32@lo ; SDAG-NEXT: s_mov_b64 s[8:9], 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[34:35] -; SDAG-NEXT: v_readlane_b32 s30, v40, 26 -; SDAG-NEXT: v_readlane_b32 s31, v40, 27 +; SDAG-NEXT: v_readlane_b32 s30, v40, 42 +; SDAG-NEXT: v_readlane_b32 s31, v40, 43 +; SDAG-NEXT: v_readlane_b32 s95, v40, 41 +; SDAG-NEXT: v_readlane_b32 s94, v40, 40 +; SDAG-NEXT: v_readlane_b32 s93, v40, 39 +; SDAG-NEXT: v_readlane_b32 s92, v40, 38 +; SDAG-NEXT: v_readlane_b32 s91, v40, 37 +; SDAG-NEXT: v_readlane_b32 s90, v40, 36 +; SDAG-NEXT: v_readlane_b32 s89, v40, 35 +; SDAG-NEXT: v_readlane_b32 s88, v40, 34 +; SDAG-NEXT: v_readlane_b32 s79, v40, 33 +; SDAG-NEXT: v_readlane_b32 s78, v40, 32 +; SDAG-NEXT: v_readlane_b32 s77, v40, 31 +; SDAG-NEXT: v_readlane_b32 s76, v40, 30 +; SDAG-NEXT: v_readlane_b32 s75, v40, 29 +; SDAG-NEXT: v_readlane_b32 s74, v40, 28 +; SDAG-NEXT: v_readlane_b32 s73, v40, 27 +; SDAG-NEXT: v_readlane_b32 s72, v40, 26 ; SDAG-NEXT: v_readlane_b32 s29, v40, 25 ; SDAG-NEXT: v_readlane_b32 s28, v40, 24 ; SDAG-NEXT: v_readlane_b32 s27, v40, 23 @@ -117,14 +149,46 @@ define amdgpu_gfx void @gfx_func() { ; GISEL-NEXT: v_writelane_b32 v40, s27, 23 ; GISEL-NEXT: v_writelane_b32 v40, s28, 24 ; GISEL-NEXT: v_writelane_b32 v40, 
s29, 25 -; GISEL-NEXT: v_writelane_b32 v40, s30, 26 -; GISEL-NEXT: v_writelane_b32 v40, s31, 27 +; GISEL-NEXT: v_writelane_b32 v40, s72, 26 +; GISEL-NEXT: v_writelane_b32 v40, s73, 27 +; GISEL-NEXT: v_writelane_b32 v40, s74, 28 +; GISEL-NEXT: v_writelane_b32 v40, s75, 29 +; GISEL-NEXT: v_writelane_b32 v40, s76, 30 +; GISEL-NEXT: v_writelane_b32 v40, s77, 31 +; GISEL-NEXT: v_writelane_b32 v40, s78, 32 +; GISEL-NEXT: v_writelane_b32 v40, s79, 33 +; GISEL-NEXT: v_writelane_b32 v40, s88, 34 +; GISEL-NEXT: v_writelane_b32 v40, s89, 35 +; GISEL-NEXT: v_writelane_b32 v40, s90, 36 +; GISEL-NEXT: v_writelane_b32 v40, s91, 37 +; GISEL-NEXT: v_writelane_b32 v40, s92, 38 +; GISEL-NEXT: v_writelane_b32 v40, s93, 39 +; GISEL-NEXT: v_writelane_b32 v40, s94, 40 +; GISEL-NEXT: v_writelane_b32 v40, s95, 41 +; GISEL-NEXT: v_writelane_b32 v40, s30, 42 +; GISEL-NEXT: v_writelane_b32 v40, s31, 43 ; GISEL-NEXT: s_mov_b32 s34, extern_c_func@abs32@lo ; GISEL-NEXT: s_mov_b32 s35, extern_c_func@abs32@hi ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GISEL-NEXT: v_readlane_b32 s30, v40, 26 -; GISEL-NEXT: v_readlane_b32 s31, v40, 27 +; GISEL-NEXT: v_readlane_b32 s30, v40, 42 +; GISEL-NEXT: v_readlane_b32 s31, v40, 43 +; GISEL-NEXT: v_readlane_b32 s95, v40, 41 +; GISEL-NEXT: v_readlane_b32 s94, v40, 40 +; GISEL-NEXT: v_readlane_b32 s93, v40, 39 +; GISEL-NEXT: v_readlane_b32 s92, v40, 38 +; GISEL-NEXT: v_readlane_b32 s91, v40, 37 +; GISEL-NEXT: v_readlane_b32 s90, v40, 36 +; GISEL-NEXT: v_readlane_b32 s89, v40, 35 +; GISEL-NEXT: v_readlane_b32 s88, v40, 34 +; GISEL-NEXT: v_readlane_b32 s79, v40, 33 +; GISEL-NEXT: v_readlane_b32 s78, v40, 32 +; GISEL-NEXT: v_readlane_b32 s77, v40, 31 +; GISEL-NEXT: v_readlane_b32 s76, v40, 30 +; GISEL-NEXT: v_readlane_b32 s75, v40, 29 +; GISEL-NEXT: v_readlane_b32 s74, v40, 28 +; GISEL-NEXT: v_readlane_b32 s73, v40, 27 +; GISEL-NEXT: v_readlane_b32 s72, v40, 26 ; GISEL-NEXT: v_readlane_b32 s29, v40, 25 ; GISEL-NEXT: 
v_readlane_b32 s28, v40, 24 ; GISEL-NEXT: v_readlane_b32 s27, v40, 23 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 6e87d39a6a7c0..bbaee4b50e383 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -9108,32 +9108,16 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s37, 3 ; GFX9-NEXT: v_writelane_b32 v40, s38, 4 ; GFX9-NEXT: v_writelane_b32 v40, s39, 5 -; GFX9-NEXT: v_writelane_b32 v40, s40, 6 -; GFX9-NEXT: v_writelane_b32 v40, s41, 7 -; GFX9-NEXT: v_writelane_b32 v40, s42, 8 -; GFX9-NEXT: v_writelane_b32 v40, s43, 9 -; GFX9-NEXT: v_writelane_b32 v40, s44, 10 -; GFX9-NEXT: v_writelane_b32 v40, s45, 11 -; GFX9-NEXT: v_writelane_b32 v40, s46, 12 -; GFX9-NEXT: v_writelane_b32 v40, s47, 13 -; GFX9-NEXT: v_writelane_b32 v40, s48, 14 -; GFX9-NEXT: v_writelane_b32 v40, s49, 15 -; GFX9-NEXT: v_writelane_b32 v40, s50, 16 -; GFX9-NEXT: v_writelane_b32 v40, s51, 17 -; GFX9-NEXT: v_writelane_b32 v40, s52, 18 -; GFX9-NEXT: v_writelane_b32 v40, s53, 19 -; GFX9-NEXT: v_writelane_b32 v40, s54, 20 -; GFX9-NEXT: v_writelane_b32 v40, s55, 21 -; GFX9-NEXT: v_writelane_b32 v40, s56, 22 -; GFX9-NEXT: v_writelane_b32 v40, s57, 23 -; GFX9-NEXT: v_writelane_b32 v40, s58, 24 -; GFX9-NEXT: v_writelane_b32 v40, s59, 25 -; GFX9-NEXT: v_writelane_b32 v40, s60, 26 -; GFX9-NEXT: v_writelane_b32 v40, s61, 27 -; GFX9-NEXT: v_writelane_b32 v40, s62, 28 -; GFX9-NEXT: v_writelane_b32 v40, s63, 29 -; GFX9-NEXT: v_writelane_b32 v40, s30, 30 -; GFX9-NEXT: v_writelane_b32 v40, s31, 31 +; GFX9-NEXT: v_writelane_b32 v40, s48, 6 +; GFX9-NEXT: v_writelane_b32 v40, s49, 7 +; GFX9-NEXT: v_writelane_b32 v40, s50, 8 +; GFX9-NEXT: v_writelane_b32 v40, s51, 9 +; GFX9-NEXT: v_writelane_b32 v40, s52, 10 +; GFX9-NEXT: v_writelane_b32 v40, s53, 11 +; GFX9-NEXT: v_writelane_b32 v40, s54, 12 +; 
GFX9-NEXT: v_writelane_b32 v40, s55, 13 +; GFX9-NEXT: v_writelane_b32 v40, s30, 14 +; GFX9-NEXT: v_writelane_b32 v40, s31, 15 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 @@ -9144,32 +9128,16 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s30, v40, 30 -; GFX9-NEXT: v_readlane_b32 s31, v40, 31 -; GFX9-NEXT: v_readlane_b32 s63, v40, 29 -; GFX9-NEXT: v_readlane_b32 s62, v40, 28 -; GFX9-NEXT: v_readlane_b32 s61, v40, 27 -; GFX9-NEXT: v_readlane_b32 s60, v40, 26 -; GFX9-NEXT: v_readlane_b32 s59, v40, 25 -; GFX9-NEXT: v_readlane_b32 s58, v40, 24 -; GFX9-NEXT: v_readlane_b32 s57, v40, 23 -; GFX9-NEXT: v_readlane_b32 s56, v40, 22 -; GFX9-NEXT: v_readlane_b32 s55, v40, 21 -; GFX9-NEXT: v_readlane_b32 s54, v40, 20 -; GFX9-NEXT: v_readlane_b32 s53, v40, 19 -; GFX9-NEXT: v_readlane_b32 s52, v40, 18 -; GFX9-NEXT: v_readlane_b32 s51, v40, 17 -; GFX9-NEXT: v_readlane_b32 s50, v40, 16 -; GFX9-NEXT: v_readlane_b32 s49, v40, 15 -; GFX9-NEXT: v_readlane_b32 s48, v40, 14 -; GFX9-NEXT: v_readlane_b32 s47, v40, 13 -; GFX9-NEXT: v_readlane_b32 s46, v40, 12 -; GFX9-NEXT: v_readlane_b32 s45, v40, 11 -; GFX9-NEXT: v_readlane_b32 s44, v40, 10 -; GFX9-NEXT: v_readlane_b32 s43, v40, 9 -; GFX9-NEXT: v_readlane_b32 s42, v40, 8 -; GFX9-NEXT: v_readlane_b32 s41, v40, 7 -; GFX9-NEXT: v_readlane_b32 s40, v40, 6 +; GFX9-NEXT: v_readlane_b32 s30, v40, 14 +; GFX9-NEXT: v_readlane_b32 s31, v40, 15 +; GFX9-NEXT: v_readlane_b32 s55, v40, 13 +; GFX9-NEXT: v_readlane_b32 s54, v40, 12 +; GFX9-NEXT: v_readlane_b32 s53, v40, 11 +; GFX9-NEXT: v_readlane_b32 s52, v40, 10 +; GFX9-NEXT: v_readlane_b32 s51, v40, 9 +; GFX9-NEXT: v_readlane_b32 s50, v40, 8 +; GFX9-NEXT: v_readlane_b32 s49, v40, 
7 +; GFX9-NEXT: v_readlane_b32 s48, v40, 6 ; GFX9-NEXT: v_readlane_b32 s39, v40, 5 ; GFX9-NEXT: v_readlane_b32 s38, v40, 4 ; GFX9-NEXT: v_readlane_b32 s37, v40, 3 @@ -9200,32 +9168,16 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s37, 3 ; GFX10-NEXT: v_writelane_b32 v40, s38, 4 ; GFX10-NEXT: v_writelane_b32 v40, s39, 5 -; GFX10-NEXT: v_writelane_b32 v40, s40, 6 -; GFX10-NEXT: v_writelane_b32 v40, s41, 7 -; GFX10-NEXT: v_writelane_b32 v40, s42, 8 -; GFX10-NEXT: v_writelane_b32 v40, s43, 9 -; GFX10-NEXT: v_writelane_b32 v40, s44, 10 -; GFX10-NEXT: v_writelane_b32 v40, s45, 11 -; GFX10-NEXT: v_writelane_b32 v40, s46, 12 -; GFX10-NEXT: v_writelane_b32 v40, s47, 13 -; GFX10-NEXT: v_writelane_b32 v40, s48, 14 -; GFX10-NEXT: v_writelane_b32 v40, s49, 15 -; GFX10-NEXT: v_writelane_b32 v40, s50, 16 -; GFX10-NEXT: v_writelane_b32 v40, s51, 17 -; GFX10-NEXT: v_writelane_b32 v40, s52, 18 -; GFX10-NEXT: v_writelane_b32 v40, s53, 19 -; GFX10-NEXT: v_writelane_b32 v40, s54, 20 -; GFX10-NEXT: v_writelane_b32 v40, s55, 21 -; GFX10-NEXT: v_writelane_b32 v40, s56, 22 -; GFX10-NEXT: v_writelane_b32 v40, s57, 23 -; GFX10-NEXT: v_writelane_b32 v40, s58, 24 -; GFX10-NEXT: v_writelane_b32 v40, s59, 25 -; GFX10-NEXT: v_writelane_b32 v40, s60, 26 -; GFX10-NEXT: v_writelane_b32 v40, s61, 27 -; GFX10-NEXT: v_writelane_b32 v40, s62, 28 -; GFX10-NEXT: v_writelane_b32 v40, s63, 29 -; GFX10-NEXT: v_writelane_b32 v40, s30, 30 -; GFX10-NEXT: v_writelane_b32 v40, s31, 31 +; GFX10-NEXT: v_writelane_b32 v40, s48, 6 +; GFX10-NEXT: v_writelane_b32 v40, s49, 7 +; GFX10-NEXT: v_writelane_b32 v40, s50, 8 +; GFX10-NEXT: v_writelane_b32 v40, s51, 9 +; GFX10-NEXT: v_writelane_b32 v40, s52, 10 +; GFX10-NEXT: v_writelane_b32 v40, s53, 11 +; GFX10-NEXT: v_writelane_b32 v40, s54, 12 +; GFX10-NEXT: v_writelane_b32 v40, s55, 13 +; GFX10-NEXT: v_writelane_b32 v40, s30, 14 +; GFX10-NEXT: v_writelane_b32 v40, s31, 15 ; GFX10-NEXT: s_clause 0x2 ; 
GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20 @@ -9237,32 +9189,16 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s30, v40, 30 -; GFX10-NEXT: v_readlane_b32 s31, v40, 31 -; GFX10-NEXT: v_readlane_b32 s63, v40, 29 -; GFX10-NEXT: v_readlane_b32 s62, v40, 28 -; GFX10-NEXT: v_readlane_b32 s61, v40, 27 -; GFX10-NEXT: v_readlane_b32 s60, v40, 26 -; GFX10-NEXT: v_readlane_b32 s59, v40, 25 -; GFX10-NEXT: v_readlane_b32 s58, v40, 24 -; GFX10-NEXT: v_readlane_b32 s57, v40, 23 -; GFX10-NEXT: v_readlane_b32 s56, v40, 22 -; GFX10-NEXT: v_readlane_b32 s55, v40, 21 -; GFX10-NEXT: v_readlane_b32 s54, v40, 20 -; GFX10-NEXT: v_readlane_b32 s53, v40, 19 -; GFX10-NEXT: v_readlane_b32 s52, v40, 18 -; GFX10-NEXT: v_readlane_b32 s51, v40, 17 -; GFX10-NEXT: v_readlane_b32 s50, v40, 16 -; GFX10-NEXT: v_readlane_b32 s49, v40, 15 -; GFX10-NEXT: v_readlane_b32 s48, v40, 14 -; GFX10-NEXT: v_readlane_b32 s47, v40, 13 -; GFX10-NEXT: v_readlane_b32 s46, v40, 12 -; GFX10-NEXT: v_readlane_b32 s45, v40, 11 -; GFX10-NEXT: v_readlane_b32 s44, v40, 10 -; GFX10-NEXT: v_readlane_b32 s43, v40, 9 -; GFX10-NEXT: v_readlane_b32 s42, v40, 8 -; GFX10-NEXT: v_readlane_b32 s41, v40, 7 -; GFX10-NEXT: v_readlane_b32 s40, v40, 6 +; GFX10-NEXT: v_readlane_b32 s30, v40, 14 +; GFX10-NEXT: v_readlane_b32 s31, v40, 15 +; GFX10-NEXT: v_readlane_b32 s55, v40, 13 +; GFX10-NEXT: v_readlane_b32 s54, v40, 12 +; GFX10-NEXT: v_readlane_b32 s53, v40, 11 +; GFX10-NEXT: v_readlane_b32 s52, v40, 10 +; GFX10-NEXT: v_readlane_b32 s51, v40, 9 +; GFX10-NEXT: v_readlane_b32 s50, v40, 8 +; GFX10-NEXT: v_readlane_b32 s49, v40, 7 +; GFX10-NEXT: v_readlane_b32 s48, v40, 6 ; GFX10-NEXT: v_readlane_b32 s39, v40, 5 ; GFX10-NEXT: v_readlane_b32 s38, v40, 4 ; GFX10-NEXT: 
v_readlane_b32 s37, v40, 3 @@ -9293,32 +9229,16 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s37, 3 ; GFX11-NEXT: v_writelane_b32 v40, s38, 4 ; GFX11-NEXT: v_writelane_b32 v40, s39, 5 -; GFX11-NEXT: v_writelane_b32 v40, s40, 6 -; GFX11-NEXT: v_writelane_b32 v40, s41, 7 -; GFX11-NEXT: v_writelane_b32 v40, s42, 8 -; GFX11-NEXT: v_writelane_b32 v40, s43, 9 -; GFX11-NEXT: v_writelane_b32 v40, s44, 10 -; GFX11-NEXT: v_writelane_b32 v40, s45, 11 -; GFX11-NEXT: v_writelane_b32 v40, s46, 12 -; GFX11-NEXT: v_writelane_b32 v40, s47, 13 -; GFX11-NEXT: v_writelane_b32 v40, s48, 14 -; GFX11-NEXT: v_writelane_b32 v40, s49, 15 -; GFX11-NEXT: v_writelane_b32 v40, s50, 16 -; GFX11-NEXT: v_writelane_b32 v40, s51, 17 -; GFX11-NEXT: v_writelane_b32 v40, s52, 18 -; GFX11-NEXT: v_writelane_b32 v40, s53, 19 -; GFX11-NEXT: v_writelane_b32 v40, s54, 20 -; GFX11-NEXT: v_writelane_b32 v40, s55, 21 -; GFX11-NEXT: v_writelane_b32 v40, s56, 22 -; GFX11-NEXT: v_writelane_b32 v40, s57, 23 -; GFX11-NEXT: v_writelane_b32 v40, s58, 24 -; GFX11-NEXT: v_writelane_b32 v40, s59, 25 -; GFX11-NEXT: v_writelane_b32 v40, s60, 26 -; GFX11-NEXT: v_writelane_b32 v40, s61, 27 -; GFX11-NEXT: v_writelane_b32 v40, s62, 28 -; GFX11-NEXT: v_writelane_b32 v40, s63, 29 -; GFX11-NEXT: v_writelane_b32 v40, s30, 30 -; GFX11-NEXT: v_writelane_b32 v40, s31, 31 +; GFX11-NEXT: v_writelane_b32 v40, s48, 6 +; GFX11-NEXT: v_writelane_b32 v40, s49, 7 +; GFX11-NEXT: v_writelane_b32 v40, s50, 8 +; GFX11-NEXT: v_writelane_b32 v40, s51, 9 +; GFX11-NEXT: v_writelane_b32 v40, s52, 10 +; GFX11-NEXT: v_writelane_b32 v40, s53, 11 +; GFX11-NEXT: v_writelane_b32 v40, s54, 12 +; GFX11-NEXT: v_writelane_b32 v40, s55, 13 +; GFX11-NEXT: v_writelane_b32 v40, s30, 14 +; GFX11-NEXT: v_writelane_b32 v40, s31, 15 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 offset:16 ; GFX11-NEXT: scratch_load_b32 v31, off, s33 @@ -9327,32 +9247,16 @@ define void 
@tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s30, v40, 30 -; GFX11-NEXT: v_readlane_b32 s31, v40, 31 -; GFX11-NEXT: v_readlane_b32 s63, v40, 29 -; GFX11-NEXT: v_readlane_b32 s62, v40, 28 -; GFX11-NEXT: v_readlane_b32 s61, v40, 27 -; GFX11-NEXT: v_readlane_b32 s60, v40, 26 -; GFX11-NEXT: v_readlane_b32 s59, v40, 25 -; GFX11-NEXT: v_readlane_b32 s58, v40, 24 -; GFX11-NEXT: v_readlane_b32 s57, v40, 23 -; GFX11-NEXT: v_readlane_b32 s56, v40, 22 -; GFX11-NEXT: v_readlane_b32 s55, v40, 21 -; GFX11-NEXT: v_readlane_b32 s54, v40, 20 -; GFX11-NEXT: v_readlane_b32 s53, v40, 19 -; GFX11-NEXT: v_readlane_b32 s52, v40, 18 -; GFX11-NEXT: v_readlane_b32 s51, v40, 17 -; GFX11-NEXT: v_readlane_b32 s50, v40, 16 -; GFX11-NEXT: v_readlane_b32 s49, v40, 15 -; GFX11-NEXT: v_readlane_b32 s48, v40, 14 -; GFX11-NEXT: v_readlane_b32 s47, v40, 13 -; GFX11-NEXT: v_readlane_b32 s46, v40, 12 -; GFX11-NEXT: v_readlane_b32 s45, v40, 11 -; GFX11-NEXT: v_readlane_b32 s44, v40, 10 -; GFX11-NEXT: v_readlane_b32 s43, v40, 9 -; GFX11-NEXT: v_readlane_b32 s42, v40, 8 -; GFX11-NEXT: v_readlane_b32 s41, v40, 7 -; GFX11-NEXT: v_readlane_b32 s40, v40, 6 +; GFX11-NEXT: v_readlane_b32 s30, v40, 14 +; GFX11-NEXT: v_readlane_b32 s31, v40, 15 +; GFX11-NEXT: v_readlane_b32 s55, v40, 13 +; GFX11-NEXT: v_readlane_b32 s54, v40, 12 +; GFX11-NEXT: v_readlane_b32 s53, v40, 11 +; GFX11-NEXT: v_readlane_b32 s52, v40, 10 +; GFX11-NEXT: v_readlane_b32 s51, v40, 9 +; GFX11-NEXT: v_readlane_b32 s50, v40, 8 +; GFX11-NEXT: v_readlane_b32 s49, v40, 7 +; GFX11-NEXT: v_readlane_b32 s48, v40, 6 ; GFX11-NEXT: v_readlane_b32 s39, v40, 5 ; GFX11-NEXT: v_readlane_b32 s38, v40, 4 ; GFX11-NEXT: v_readlane_b32 s37, v40, 3 @@ -9383,32 +9287,16 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 3 ; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s40, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s41, 7 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s42, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s43, 9 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s44, 10 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s45, 11 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s46, 12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s47, 13 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 14 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 15 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 17 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s52, 18 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s53, 19 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s54, 20 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s55, 21 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s56, 22 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s57, 23 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s58, 24 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s59, 25 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s60, 26 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s61, 27 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s62, 28 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s63, 29 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 30 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 31 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 9 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s52, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s53, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s54, 12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s55, 13 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 14 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 15 ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; 
GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:16 ; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33 @@ -9417,32 +9305,16 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 30 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 31 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s63, v40, 29 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s62, v40, 28 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s61, v40, 27 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s60, v40, 26 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s59, v40, 25 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s58, v40, 24 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s57, v40, 23 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s56, v40, 22 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s55, v40, 21 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s54, v40, 20 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s53, v40, 19 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s52, v40, 18 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 17 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 15 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 14 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s47, v40, 13 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s46, v40, 12 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s45, v40, 11 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s44, v40, 10 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s43, v40, 9 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s42, v40, 8 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s41, v40, 7 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s40, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 14 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 15 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s55, v40, 13 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s54, v40, 12 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s53, v40, 11 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s52, 
v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 6 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s39, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s38, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s37, v40, 3 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index 8a7762fb4b6c7..a0bbc0229a239 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -74,7 +74,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX908-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] + ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] ; GFX908-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec ; GFX908-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -136,7 +136,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit 
$exec ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX90A_GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] + ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] ; GFX90A_GFX940-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index 1fb34abb41a2d..47681ad9e5168 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -77,7 +77,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX90A-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] + ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] ; GFX90A-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit 
$exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] @@ -105,7 +105,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 ; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX90A-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] @@ -159,7 +159,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX940-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] + ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] ; GFX940-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], 
implicit $exec ; GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] @@ -187,7 +187,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 ; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX940-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec ; GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] @@ -239,11 +239,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 15 - ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_2]] + ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_2]] ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed [[V_READLANE_B32_]], killed [[S_MOV_B32_3]], [[V_MOV_B32_dpp5]] ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 31 - ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed 
[[S_MOV_B32_4]] + ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_4]] ; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] @@ -271,7 +271,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF3]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX11-NEXT: early-clobber %47:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %47, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY6]] diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 2be6bf302d35f..73b4428b03c81 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -6936,7 +6936,6 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: @@ -7158,7 +7157,6 @@ define double 
@global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7381,7 +7379,6 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7610,7 +7607,6 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: @@ -7814,7 +7810,6 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8021,7 +8016,6 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8256,7 +8250,6 @@ define half 
@global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -8561,6 +8554,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8596,7 +8590,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8912,6 +8905,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8947,7 +8941,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ 
-9296,7 +9289,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: @@ -9591,6 +9583,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -9624,7 +9617,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9929,6 +9921,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -9962,7 +9955,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10291,7 +10283,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10551,7 +10542,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10782,6 +10772,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -10818,7 +10809,6 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11136,6 +11126,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, 
vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -11170,7 +11161,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11502,8 +11492,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11523,7 +11514,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -11873,6 +11863,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11896,8 +11887,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11917,7 +11909,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12279,6 +12270,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -12302,8 +12294,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; 
GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -12323,7 +12316,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12707,6 +12699,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -12726,7 +12719,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: @@ -13066,6 +13058,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -13089,6 +13082,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: 
v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -13108,7 +13102,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13459,6 +13452,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -13482,6 +13476,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -13501,7 +13496,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13867,6 +13861,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -13885,7 +13880,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -14188,8 +14182,9 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -14207,7 +14202,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 
@@ -14491,6 +14485,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -14514,8 +14509,9 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -14536,7 +14532,6 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14900,6 +14895,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -14923,6 +14919,7 @@ define void 
@global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -14943,7 +14940,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB63_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -23072,7 +23068,6 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX12-NEXT: s_cbranch_execz .LBB92_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index e2fde562d36b1..cd6ed1e6b98c2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -3013,7 +3013,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -3186,7 +3185,6 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3360,7 +3358,6 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3533,7 +3530,6 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -3696,7 +3692,6 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3862,7 +3857,6 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -4029,7 +4023,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -4278,7 +4271,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4469,7 +4461,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -4786,6 +4777,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -4823,7 +4815,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5151,6 +5142,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 
0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -5188,7 +5180,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5552,7 +5543,6 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: @@ -5860,6 +5850,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -5896,7 +5887,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6214,6 +6204,7 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -6250,7 +6241,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6594,7 +6584,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6869,7 +6858,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7113,6 +7101,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7151,7 +7140,6 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7481,6 +7469,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7518,7 +7507,6 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7863,8 +7851,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -7884,7 +7873,6 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -8236,6 +8224,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8259,8 +8248,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8280,7 +8270,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8644,6 +8633,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8667,8 +8657,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8688,7 +8679,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9074,6 +9064,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -9093,7 +9084,6 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9435,6 +9425,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9458,6 +9449,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -9477,7 +9469,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9830,6 +9821,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9853,6 +9845,7 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -9872,7 +9865,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10240,6 +10232,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -10258,7 +10251,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10563,8 +10555,9 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10582,7 +10575,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10868,6 +10860,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10891,8 +10884,9 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -10913,7 +10907,6 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11279,6 +11272,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11302,6 +11296,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11322,7 +11317,6 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11703,7 +11697,6 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -11996,7 +11989,6 @@ define <2 x half> 
@global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12291,7 +12283,6 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12589,7 +12580,6 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12871,7 +12861,6 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13156,7 +13145,6 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13451,7 +13439,6 @@ define <2 x half> 
@global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13748,7 +13735,6 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14039,9 +14025,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14058,7 +14045,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -14438,9 +14424,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: 
v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14457,7 +14444,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14839,9 +14825,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14858,7 +14845,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15243,8 +15229,10 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15261,7 +15249,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15629,8 +15616,10 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15647,7 +15636,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16018,8 +16006,10 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16036,7 +16026,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16416,9 +16405,10 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -16436,7 +16426,6 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; 
GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16819,8 +16808,10 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16838,7 +16829,6 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 903e80b15814f..b49047c54d7dd 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -3013,7 +3013,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3186,7 +3185,6 @@ define double 
@global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3360,7 +3358,6 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3533,7 +3530,6 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -3696,7 +3692,6 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -3862,7 +3857,6 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -4029,7 +4023,6 @@ define double 
@global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -4278,7 +4271,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -4469,7 +4461,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -4786,6 +4777,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -4823,7 +4815,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ 
-5151,6 +5142,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -5188,7 +5180,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5552,7 +5543,6 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: @@ -5860,6 +5850,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -5896,7 +5887,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6214,6 +6204,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -6250,7 +6241,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6594,7 +6584,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -6869,7 +6858,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -7113,6 +7101,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7151,7 +7140,6 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7481,6 +7469,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7518,7 +7507,6 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -7863,8 +7851,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -7884,7 +7873,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -8236,6 +8224,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8259,8 +8248,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8280,7 +8270,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8644,6 +8633,7 @@ define bfloat 
@global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8667,8 +8657,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8688,7 +8679,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -9074,6 +9064,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -9093,7 +9084,6 @@ define void 
@global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: @@ -9435,6 +9425,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9458,6 +9449,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -9477,7 +9469,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9830,6 +9821,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9853,6 +9845,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -9872,7 +9865,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10240,6 +10232,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -10258,7 +10251,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -10563,8 +10555,9 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10582,7 +10575,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: @@ -10868,6 +10860,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10891,8 +10884,9 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -10913,7 +10907,6 @@ define bfloat 
@global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11279,6 +11272,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11302,6 +11296,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11322,7 +11317,6 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -11703,7 +11697,6 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -11996,7 +11989,6 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12291,7 +12283,6 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12589,7 +12580,6 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: @@ -12871,7 +12861,6 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13156,7 +13145,6 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13451,7 +13439,6 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13748,7 +13735,6 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14039,9 +14025,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14058,7 +14045,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -14438,9 +14424,10 @@ define <2 x bfloat> 
@global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14457,7 +14444,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14839,9 +14825,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14858,7 +14845,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15243,8 +15229,10 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15261,7 +15249,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: @@ -15629,8 +15616,10 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15647,7 +15636,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16018,8 +16006,10 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16036,7 +16026,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -16416,9 +16405,10 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -16436,7 +16426,6 @@ define <2 x bfloat> 
@global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16819,8 +16808,10 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16838,7 +16829,6 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 24791b60bfc6d..5577029f502d0 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -42,7 +42,6 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fsub_ret_f32: @@ -274,7 +273,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos: @@ -508,7 +506,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: @@ -750,7 +747,6 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32: @@ -971,7 +967,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos: @@ -1195,7 +1190,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg: @@ -1430,7 +1424,6 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos: @@ -1665,7 +1658,6 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos: @@ -1897,7 +1889,6 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__ftz: @@ -2129,7 +2120,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -2363,7 +2353,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: @@ -2605,7 +2594,6 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__ftz: @@ -2826,7 +2814,6 @@ define void 
@global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3050,7 +3037,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz: @@ -3285,7 +3271,6 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz: @@ -3520,7 +3505,6 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz: @@ -3752,7 +3736,6 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f64: @@ -4004,7 +3987,6 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -4257,7 +4239,6 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -4516,7 +4497,6 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64: @@ -4746,7 +4726,6 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos: @@ -4979,7 +4958,6 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg: @@ -5240,7 +5218,6 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16: 
@@ -5545,6 +5522,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -5580,7 +5558,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -5896,6 +5873,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -5931,7 +5909,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -6280,7 +6257,6 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16: @@ -6575,6 +6551,7 @@ define void 
@global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -6608,7 +6585,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: @@ -6913,6 +6889,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -6946,7 +6923,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: @@ -7275,7 +7251,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -7535,7 +7510,6 @@ define void 
@global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: @@ -7766,6 +7740,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -7802,7 +7777,6 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: @@ -8120,6 +8094,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -8154,7 +8129,6 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: @@ -8486,8 +8460,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr 
addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8507,7 +8482,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16: @@ -8857,6 +8831,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -8880,8 +8855,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -8901,7 +8877,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -9263,6 +9238,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -9286,8 +9262,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9307,7 +9284,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -9691,6 +9667,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: 
v_or_b32_e32 v8, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -9710,7 +9687,6 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16: @@ -10050,6 +10026,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10073,6 +10050,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10092,7 +10070,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: @@ -10443,6 +10420,7 @@ define void 
@global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -10466,6 +10444,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -10485,7 +10464,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: @@ -10851,6 +10829,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -10869,7 +10848,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -11172,8 +11150,9 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11191,7 +11170,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: @@ -11475,6 +11453,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11498,8 +11477,9 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: 
s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11520,7 +11500,6 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -11884,6 +11863,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -11907,6 +11887,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -11927,7 +11908,6 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: @@ -12304,7 +12284,6 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16: @@ -12582,7 +12561,6 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -12862,7 +12840,6 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -13144,7 +13121,6 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16: @@ -13409,7 +13385,6 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos: @@ -13677,7 +13652,6 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg: @@ -13956,7 
+13930,6 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -14237,7 +14210,6 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos: @@ -14514,9 +14486,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14533,7 +14506,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16: @@ -14913,9 +14885,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -14932,7 +14905,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -15314,9 +15286,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -15333,7 +15306,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -15718,8 +15690,10 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -15736,7 +15710,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16: @@ -16104,8 +16077,10 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16122,7 +16097,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: @@ -16493,8 +16467,10 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, 
v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -16511,7 +16487,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: @@ -16891,9 +16866,10 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo ; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 @@ -16911,7 +16887,6 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -17294,8 +17269,10 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: 
v_add3_u32 v7, v7, v2, 0x7fff ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 @@ -17313,7 +17290,6 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll index bfd57aebad521..a33d363e11bcf 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -537,6 +537,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -664,6 +665,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, 
v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -779,6 +781,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -891,6 +894,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -1445,6 +1449,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -1572,6 +1577,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -1687,6 +1693,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; 
GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -1799,6 +1806,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -2353,6 +2361,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -2480,6 +2489,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -2595,6 +2605,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -2707,6 +2718,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: 
v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -3261,6 +3273,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -3388,6 +3401,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -3503,6 +3517,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS @@ -3615,6 +3630,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX12-NEXT: global_wb scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index b2f113f08a916..492a30b67089c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll 
@@ -1679,6 +1679,7 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: ; return to shader part epilog %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 @@ -1745,6 +1746,7 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:42 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: ; return to shader part epilog %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 @@ -1931,6 +1933,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32: @@ -1943,6 +1946,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset @@ -1986,6 +1990,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 ; GFX12-SDAG-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: @@ -1998,6 +2003,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:4095 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset @@ -2065,6 +2071,7 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1 ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_saddr_f32_natural_addressing: @@ -2081,6 +2088,7 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc ; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: ; return to shader part epilog %voffset = load i32, ptr addrspace(1) %voffset.ptr %zext.offset = zext i32 %voffset to i64 @@ -2246,6 +2254,7 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addr ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: @@ -2262,6 +2271,7 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr 
addr ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc ; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: ; return to shader part epilog %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !1, !noundef !{} %zext.offset = zext i32 %voffset to i64 @@ -4727,10 +4737,12 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc ; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4845,10 +4857,12 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc ; GFX12-GISEL-NEXT: global_load_b32 v6, v[4:5], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: 
s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index fbe06b3651b06..4c6c7ebcc31c7 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -735,12 +735,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -760,8 +760,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -1955,12 +1955,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-LABEL: 
global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -1980,8 +1980,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -3235,12 +3235,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 
s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -3260,8 +3260,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -4011,12 +4011,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -4036,8 +4036,8 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -5316,12 +5316,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -5341,8 +5341,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: 
s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -5747,14 +5747,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 @@ -5763,16 +5763,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 
s[0:1], s[52:53], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 ; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4 @@ -5785,11 +5785,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -5802,42 +5802,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; 
GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 @@ -5845,21 +5845,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; 
GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 ; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s42, s9 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s50, s9 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start @@ -5872,68 +5872,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: 
s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v3, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v4, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b32 s33, s10 ; GFX1064-NEXT: s_mov_b64 s[10:11], exec -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s10, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 
s[48:49], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s11, v3 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -5950,69 +5950,69 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: 
s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_mov_b32 s9, exec_lo -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; 
GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s9 ; GFX1032-NEXT: s_mov_b32 s33, s10 ; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6029,37 +6029,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, 
off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm 
@@ -6070,7 +6070,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6079,16 +6079,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -6111,18 +6111,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; 
GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6130,8 +6130,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-NEXT: .LBB9_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6142,24 +6142,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, 
s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -6179,25 +6179,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-NEXT: 
.LBB9_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6206,14 +6206,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 @@ -6222,16 +6222,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; 
GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4 @@ -6244,11 +6244,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -6261,42 +6261,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; 
GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-DPP-NEXT: .LBB9_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-DPP-NEXT: s_addc_u32 
s49, s49, 0 +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 @@ -6304,21 +6304,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start @@ -6331,68 +6331,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; 
GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], exec -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s10, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s11, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1064-DPP-NEXT: 
v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6409,69 +6409,69 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_mov_b32 s9, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, 
vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s9 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6488,37 +6488,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], 
s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -6529,7 +6529,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 
s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6538,16 +6538,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6570,18 +6570,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; 
GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6589,8 +6589,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6601,24 +6601,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: 
s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6638,25 +6638,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6669,19 +6669,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -6692,15 +6692,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; 
GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -6725,21 +6725,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, 
s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -6752,44 +6752,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 ; GFX7LESS-NEXT: .LBB10_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 
-; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -6801,14 +6801,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -6833,11 +6833,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB10_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6848,53 +6848,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: 
v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB10_4 ; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -6906,14 +6906,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: 
s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -6938,11 +6938,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB10_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -6952,55 +6952,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; 
GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm ; ; 
GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7012,14 +7012,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7038,16 +7038,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB10_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -7057,37 +7057,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], 
s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm @@ -7095,11 +7095,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; 
GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7107,11 +7107,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -7143,11 +7143,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB10_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start @@ -7164,18 +7164,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: 
v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7183,8 +7183,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7193,7 +7193,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -7202,9 +7202,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; 
GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -7233,17 +7233,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start @@ -7258,25 +7258,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 
+; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7285,22 +7285,22 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; 
GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7311,30 +7311,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -7347,44 +7347,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7396,14 +7396,14 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7450,74 +7450,74 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 
v[3:4], v[1:2], s[44:45] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[52:53] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: 
buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7529,14 +7529,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: 
s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7581,10 +7581,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) @@ -7594,55 +7594,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, 
off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, 
v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7654,14 +7654,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: 
s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7696,14 +7696,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -7713,37 +7713,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: 
buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 
+; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -7751,11 +7751,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7763,11 +7763,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -7825,10 +7825,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; 
GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7845,18 +7845,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7864,8 +7864,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; 
GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7874,7 +7874,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7883,9 +7883,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -7936,14 +7936,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7958,25 +7958,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: 
s_set_inst_prefetch_distance 0x2 @@ -8922,12 +8922,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -8947,8 +8947,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 @@ -10355,12 +10355,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: 
s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -10380,8 +10380,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 @@ -11270,12 +11270,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: 
s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -11295,8 +11295,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 @@ -11771,13 +11771,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 @@ -11791,15 +11791,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 
s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -11811,11 +11811,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -11828,40 +11828,40 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; 
GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; 
GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 @@ -11874,20 +11874,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start @@ -11900,48 +11900,48 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: 
buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-NEXT: s_addc_u32 
s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 @@ -11953,19 +11953,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s10 -; GFX1064-NEXT: s_mov_b32 s42, s9 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s50, s9 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -11978,53 +11978,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 
0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: 
s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] @@ -12033,18 +12033,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s10 -; GFX1032-NEXT: s_mov_b32 s42, s9 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s50, s9 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -12057,44 +12057,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12115,16 +12115,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s10 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -12145,18 +12145,18 @@ define 
amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12164,8 +12164,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-NEXT: .LBB16_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12173,12 +12173,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12193,15 +12193,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12219,25 +12219,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 
s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-NEXT: .LBB16_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12246,13 +12246,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ 
-12266,15 +12266,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -12286,11 +12286,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: 
s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -12303,40 +12303,40 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-DPP-NEXT: .LBB16_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -12349,20 +12349,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, 
v0, v4, v3 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start @@ -12375,48 +12375,48 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, 
s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 @@ -12428,19 +12428,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 -; 
GFX1064-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12453,53 +12453,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; 
GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 
s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] @@ -12508,18 +12508,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12532,44 +12532,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; 
GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, 
s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12590,16 +12590,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12620,18 +12620,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12639,8 +12639,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12648,12 +12648,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12668,15 +12668,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12694,25 +12694,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; 
GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12725,19 +12725,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; 
GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -12748,15 +12748,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -12781,21 +12781,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB17_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: 
s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -12808,44 +12808,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: 
v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 ; GFX7LESS-NEXT: .LBB17_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -12857,14 +12857,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; 
GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -12889,11 +12889,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB17_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -12904,53 +12904,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], 
s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB17_4 ; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; 
GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -12962,14 +12962,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -12994,11 +12994,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB17_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: 
s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -13008,55 +13008,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; 
GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 
s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13068,14 +13068,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13094,16 +13094,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB17_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -13113,37 +13113,37 @@ define 
amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 
offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm @@ -13151,11 +13151,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13163,11 +13163,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -13199,11 +13199,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; 
GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB17_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start @@ -13220,18 +13220,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13239,8 +13239,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; 
GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13249,7 +13249,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -13258,9 +13258,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -13289,17 +13289,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB17_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: 
global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start @@ -13314,25 +13314,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13341,22 +13341,22 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-LABEL: 
global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -13367,30 +13367,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 
s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -13403,44 +13403,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 
s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13452,14 +13452,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13506,74 +13506,74 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, 
exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[44:45] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[52:53] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; 
GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; 
GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13585,14 +13585,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13637,10 +13637,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 ; 
GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) @@ -13650,55 +13650,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; 
GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; 
GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13710,14 +13710,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13752,14 +13752,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 ; 
GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -13769,37 +13769,37 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, 
s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -13807,11 +13807,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13819,11 
+13819,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -13881,10 +13881,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start @@ -13901,18 +13901,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; 
GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13920,8 +13920,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13930,7 +13930,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -13939,9 +13939,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; 
GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -13992,14 +13992,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start @@ -14014,25 +14014,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; 
GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 7792422291998..6ff6a63ea3f1d 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -635,12 +635,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 
s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -660,8 +660,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 @@ -1674,12 +1674,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -1699,8 +1699,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] 
; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 @@ -2713,12 +2713,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -2738,8 +2738,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 @@ -3135,13 +3135,13 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -3149,15 +3149,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -3169,8 +3169,8 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 @@ -3178,8 +3178,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -3188,59 +3188,59 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: 
v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3253,36 +3253,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; 
GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm @@ -3290,32 +3290,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-NEXT: s_addc_u32 s65, 
s65, 0 +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s10 -; GFX1064-NEXT: s_mov_b32 s42, s9 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s50, s9 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -3328,70 +3328,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: 
v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s64, 
SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s10 -; GFX1032-NEXT: s_mov_b32 s42, s9 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s50, s9 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -3404,38 +3404,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: 
buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; 
GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm @@ -3444,7 +3444,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3452,16 +3452,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s10 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -3483,18 +3483,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, 
s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3502,8 +3502,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-NEXT: .LBB6_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3513,22 +3513,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 
s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -3547,26 +3547,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; 
GFX1132-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-NEXT: .LBB6_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3575,13 +3575,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -3589,15 +3589,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 
s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -3609,8 +3609,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 @@ -3618,8 +3618,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -3628,59 +3628,59 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: 
s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-DPP-NEXT: .LBB6_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 
0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3693,36 +3693,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; 
GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-DPP-NEXT: 
.LBB6_3: ; GFX9-DPP-NEXT: s_endpgm @@ -3730,32 +3730,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3768,70 +3768,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; 
GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3844,38 +3844,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; 
GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -3884,7 +3884,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; 
GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3892,16 +3892,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3923,18 +3923,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], 
v[3:4], 4.0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3942,8 +3942,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3953,22 +3953,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: 
s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -3987,26 +3987,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; 
GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4019,19 +4019,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -4042,15 +4042,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] 
-; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -4077,19 +4077,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] @@ -4097,8 +4097,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: 
s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -4107,43 +4107,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, 
s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 ; GFX7LESS-NEXT: .LBB7_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4155,14 +4155,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; 
GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4189,12 +4189,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB7_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4205,54 +4205,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v1, 
off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB7_4 ; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, 
s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4264,14 +4264,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4298,12 +4298,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB7_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -4313,56 +4313,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], 
s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1064-NEXT: .LBB7_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4374,14 +4374,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; 
GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4402,17 +4402,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB7_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -4422,38 +4422,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: 
buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; 
GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1032-NEXT: .LBB7_5: ; GFX1032-NEXT: s_endpgm @@ -4461,11 +4461,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4473,11 +4473,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -4511,12 +4511,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB7_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start @@ -4530,15 +4530,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off @@ -4553,8 +4553,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4563,7 +4563,7 @@ define 
amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -4572,9 +4572,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -4606,19 +4606,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 
6 ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start @@ -4632,16 +4632,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s45 +; GFX1132-NEXT: v_mov_b32_e32 v3, s53 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_mov_b32_e32 v2, s52 ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 @@ -4653,8 +4653,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4663,22 +4663,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: 
s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -4689,34 +4689,34 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, 
s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -4725,43 +4725,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 
8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; 
GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4773,14 +4773,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4834,20 +4834,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; 
GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[52:53], s[52:53] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -4856,54 +4856,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; 
GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; 
GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4915,14 +4915,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4975,10 +4975,10 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] @@ -4989,56 +4989,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; 
GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: 
s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -5050,14 +5050,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -5098,15 +5098,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 
+; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -5116,38 +5116,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: 
s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -5155,11 +5155,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; 
GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -5167,11 +5167,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -5239,11 +5239,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start @@ -5261,18 +5261,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: 
s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -5280,8 +5280,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -5290,7 +5290,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: 
s_getpc_b64 s[0:1] @@ -5299,9 +5299,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -5356,16 +5356,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start @@ -5381,26 +5381,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: 
s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6162,12 +6162,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; 
GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -6187,8 +6187,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 @@ -6683,13 +6683,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: 
v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -6697,15 +6697,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -6717,8 +6717,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 @@ -6726,8 +6726,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, 
s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -6736,59 +6736,59 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: 
.LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start @@ -6801,36 +6801,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, 
s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm @@ -6838,32 +6838,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s10 -; GFX1064-NEXT: s_mov_b32 s42, s9 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s50, s9 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -6876,70 +6876,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, 
s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: 
v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s10 -; GFX1032-NEXT: s_mov_b32 s42, s9 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s50, s9 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -6952,38 +6952,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 
s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm @@ -6992,7 +6992,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7000,16 +7000,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 
s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s10 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -7031,18 +7031,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7050,8 +7050,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 
s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-NEXT: .LBB10_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7061,22 +7061,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7095,26 +7095,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, 
s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-NEXT: .LBB10_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7123,13 +7123,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; 
GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -7137,15 +7137,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -7157,8 +7157,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; 
GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 @@ -7166,8 +7166,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -7176,59 +7176,59 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: 
buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-DPP-NEXT: .LBB10_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7241,36 +7241,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 
offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm @@ -7278,32 +7278,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; 
GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7316,70 +7316,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, 
s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7392,38 +7392,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, 
off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -7432,7 +7432,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7440,16 +7440,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; 
GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7471,18 +7471,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7490,8 +7490,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: 
s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7501,22 +7501,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7535,26 +7535,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; 
GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7567,19 +7567,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: 
s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -7590,15 +7590,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -7625,19 +7625,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; 
GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] @@ -7645,8 +7645,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -7655,43 +7655,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: 
s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 ; GFX7LESS-NEXT: .LBB11_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; 
GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -7703,14 +7703,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7737,12 +7737,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB11_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7753,54 +7753,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: 
buffer_load_dword v4, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB11_4 ; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -7812,14 +7812,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 
s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7846,12 +7846,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB11_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -7861,56 +7861,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: 
v_mov_b32_e32 v2, s52 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, 
SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -7922,14 +7922,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7950,17 +7950,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; 
GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB11_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -7970,38 +7970,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; 
GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm @@ -8009,11 +8009,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: 
s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8021,11 +8021,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -8059,12 +8059,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB11_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start @@ -8078,15 +8078,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 +; 
GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off @@ -8101,8 +8101,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8111,7 +8111,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -8120,9 +8120,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, 
s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -8154,19 +8154,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB11_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start @@ -8180,16 +8180,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s45 +; GFX1132-NEXT: v_mov_b32_e32 v3, s53 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; 
GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_mov_b32_e32 v2, s52 ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 @@ -8201,8 +8201,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8211,22 +8211,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 
s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8237,34 +8237,34 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: 
v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -8273,43 +8273,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: 
s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, 
div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8321,14 +8321,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8382,20 +8382,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; 
GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[52:53], s[52:53] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -8404,54 +8404,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; 
GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: 
s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8463,14 +8463,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8523,10 +8523,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] @@ -8537,56 +8537,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; 
GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; 
GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8598,14 +8598,14 @@ define 
amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8646,15 +8646,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -8664,38 +8664,38 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -8703,11 +8703,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8715,11 +8715,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, 
s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -8787,11 +8787,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start @@ -8809,18 +8809,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8828,8 +8828,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8838,7 +8838,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8847,9 +8847,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -8904,16 +8904,16 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start @@ -8929,26 +8929,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: 
v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index cb3291df891af..93c7ec3067b96 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -635,12 +635,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; 
GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -660,8 +660,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 @@ -1674,12 +1674,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -1699,8 +1699,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 
s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 @@ -2713,12 +2713,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -2738,8 +2738,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 @@ -3135,13 +3135,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 
s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -3149,15 +3149,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -3169,8 +3169,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, 
s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 @@ -3178,8 +3178,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -3188,59 +3188,59 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3253,36 +3253,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, 
s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm @@ -3290,32 +3290,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-NEXT: ; %bb.1: -; 
GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s10 -; GFX1064-NEXT: s_mov_b32 s42, s9 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s50, s9 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -3328,70 +3328,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; 
GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; 
GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s10 -; GFX1032-NEXT: s_mov_b32 s42, s9 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s50, s9 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -3404,38 +3404,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: 
v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm @@ -3444,7 +3444,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164: 
; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3452,16 +3452,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s10 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -3483,18 +3483,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3502,8 +3502,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-NEXT: .LBB6_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3513,22 +3513,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: 
s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -3547,26 +3547,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-NEXT: .LBB6_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3575,13 +3575,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-LABEL: 
global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -3589,15 +3589,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: 
v_or_b32_e32 v3, v0, v1 @@ -3609,8 +3609,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 @@ -3618,8 +3618,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -3628,59 +3628,59 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 
; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-DPP-NEXT: .LBB6_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; 
GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3693,36 +3693,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_min_f64 
v[3:4], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm @@ -3730,32 +3730,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; 
GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3768,70 +3768,70 
@@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3844,38 +3844,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; 
GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -3884,7 +3884,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3892,16 +3892,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3923,18 +3923,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 
off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3942,8 +3942,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3953,22 +3953,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 
s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -3987,26 +3987,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-DPP-NEXT: 
.LBB6_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4019,19 +4019,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -4042,15 +4042,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 
-; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -4077,19 +4077,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] @@ -4097,8 +4097,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, 
s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -4107,43 +4107,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 ; GFX7LESS-NEXT: 
.LBB7_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4155,14 +4155,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4189,12 +4189,12 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB7_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4205,54 +4205,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; 
GFX9-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB7_4 ; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; 
GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4264,14 +4264,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4298,12 +4298,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB7_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -4313,56 +4313,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: 
s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: 
buffer_load_dword v4, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1064-NEXT: .LBB7_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4374,14 +4374,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; 
GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4402,17 +4402,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB7_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -4422,38 +4422,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; 
GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; 
GFX1032-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1032-NEXT: .LBB7_5: ; GFX1032-NEXT: s_endpgm @@ -4461,11 +4461,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4473,11 +4473,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -4511,12 +4511,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB7_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start @@ -4530,15 +4530,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off @@ -4553,8 +4553,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4563,7 +4563,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; 
GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -4572,9 +4572,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -4606,19 +4606,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start @@ -4632,16 +4632,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, 
s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s45 +; GFX1132-NEXT: v_mov_b32_e32 v3, s53 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_mov_b32_e32 v2, s52 ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 @@ -4653,8 +4653,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4663,22 +4663,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: 
s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -4689,34 +4689,34 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 
s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -4725,43 +4725,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; 
GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; 
GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4773,14 +4773,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4834,20 +4834,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: 
v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[52:53], s[52:53] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -4856,54 +4856,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: 
v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; 
GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -4915,14 +4915,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -4975,10 +4975,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; 
GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] @@ -4989,56 +4989,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; 
GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -5050,14 +5050,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -5098,15 +5098,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -5116,38 +5116,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], 
s[66:67] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -5155,11 +5155,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 
s1, s1, div.double.value@gotpcrel32@hi+12 @@ -5167,11 +5167,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -5239,11 +5239,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start @@ -5261,18 +5261,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: 
s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -5280,8 +5280,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -5290,7 +5290,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5299,9 +5299,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: 
s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -5356,16 +5356,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start @@ -5381,26 +5381,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6162,12 +6162,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 
+; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -6187,8 +6187,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 @@ -6683,13 +6683,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -6697,15 +6697,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 ; 
GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -6717,8 +6717,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 @@ -6726,8 +6726,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; 
GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -6736,59 +6736,59 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, 
SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start @@ -6801,36 +6801,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: 
buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: 
s_endpgm @@ -6838,32 +6838,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s10 -; GFX1064-NEXT: s_mov_b32 s42, s9 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s50, s9 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: 
v_mov_b32_e32 v1, s0 @@ -6876,70 +6876,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: 
buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s10 -; GFX1032-NEXT: s_mov_b32 s42, s9 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s50, s9 +; GFX1032-NEXT: 
s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -6952,38 +6952,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: 
buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm @@ -6992,7 +6992,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7000,16 +7000,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s10 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 
s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -7031,18 +7031,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7050,8 +7050,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 ; 
GFX1164-NEXT: .LBB10_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7061,22 +7061,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7095,26 +7095,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_min_f64 
v[3:4], v[3:4], 4.0 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-NEXT: .LBB10_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7123,13 +7123,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 
v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -7137,15 +7137,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -7157,8 +7157,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 @@ -7166,8 +7166,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -7176,59 +7176,59 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; 
GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-DPP-NEXT: .LBB10_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; 
GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7241,36 +7241,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; 
GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm @@ -7278,32 +7278,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; 
GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7316,70 +7316,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: 
global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ 
-7392,38 +7392,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; 
GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -7432,7 +7432,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7440,16 +7440,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; 
GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7471,18 +7471,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7490,8 +7490,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; 
GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7501,22 +7501,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7535,26 +7535,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 
s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7567,19 +7567,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, 
s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -7590,15 +7590,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -7625,19 +7625,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] @@ -7645,8 +7645,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 @@ -7655,43 +7655,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; 
GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 ; GFX7LESS-NEXT: .LBB11_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, 
s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -7703,14 +7703,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7737,12 +7737,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB11_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7753,54 +7753,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: 
s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: 
s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB11_4 ; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -7812,14 +7812,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: 
s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7846,12 +7846,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB11_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -7861,56 +7861,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: 
v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, 
s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -7922,14 +7922,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7950,17 +7950,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: 
s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB11_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -7970,38 +7970,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 
s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm @@ -8009,11 +8009,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, 
div.double.value@gotpcrel32@hi+12 @@ -8021,11 +8021,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -8059,12 +8059,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB11_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start @@ -8078,15 +8078,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; 
GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off @@ -8101,8 +8101,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8111,7 +8111,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -8120,9 +8120,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 
s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -8154,19 +8154,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB11_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start @@ -8180,16 +8180,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s45 +; GFX1132-NEXT: v_mov_b32_e32 v3, s53 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s44 +; GFX1132-NEXT: v_mov_b32_e32 v2, s52 ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] 
; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 @@ -8201,8 +8201,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8211,22 +8211,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 
0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8237,34 +8237,34 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; 
GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 @@ -8273,43 +8273,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8321,14 +8321,14 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8382,20 +8382,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] +; 
GFX9-DPP-NEXT: v_max_f64 v[3:4], s[52:53], s[52:53] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -8404,54 +8404,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; 
GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, 
div.double.value@gotpcrel32@hi+12 @@ -8463,14 +8463,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8523,10 +8523,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] @@ -8537,56 +8537,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: 
s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 
offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8598,14 +8598,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], 
s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8646,15 +8646,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -8664,38 +8664,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 
s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -8703,11 +8703,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 @@ -8715,11 +8715,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -8787,11 +8787,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start @@ -8809,18 +8809,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8828,8 +8828,8 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8838,7 +8838,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8847,9 +8847,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -8904,16 +8904,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1132-DPP-NEXT: 
s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start @@ -8929,26 +8929,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: 
scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 6dc3a1971a485..597260579f057 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -821,12 +821,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -846,8 +846,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; 
GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -2153,12 +2153,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -2178,8 +2178,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -3485,12 +3485,12 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -3510,8 +3510,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -4313,12 +4313,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 
-; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -4338,8 +4338,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -5644,12 +5644,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, 
s8 @@ -5669,8 +5669,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 @@ -6075,14 +6075,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 @@ -6091,16 +6091,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 
s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 ; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4 @@ -6113,11 +6113,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -6130,42 +6130,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 
s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_add_u32 
s64, s64, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 @@ -6173,21 +6173,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 ; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s42, s9 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s50, s9 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start @@ -6200,68 +6200,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: 
s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v3, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v4, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, 
s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b32 s33, s10 ; GFX1064-NEXT: s_mov_b64 s[10:11], exec -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s10, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s11, v3 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6278,69 +6278,69 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, 
s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; 
GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_mov_b32 s9, exec_lo -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s9 ; GFX1032-NEXT: s_mov_b32 s33, s10 ; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6357,37 
+6357,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, 
off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm @@ -6398,7 +6398,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6407,16 +6407,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -6439,18 +6439,18 @@ define 
amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6458,8 +6458,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-NEXT: .LBB9_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6470,24 +6470,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; 
GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -6507,25 +6507,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: 
s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-NEXT: .LBB9_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6534,14 +6534,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 @@ -6550,16 +6550,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: 
s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4 @@ -6572,11 +6572,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -6589,42 +6589,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-DPP-NEXT: .LBB9_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; 
GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 @@ -6632,21 +6632,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start @@ -6659,68 +6659,68 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword 
v3, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], exec -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s10, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s11, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; 
GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6737,69 +6737,69 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: 
s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_mov_b32 s9, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; 
GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s9 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6816,37 +6816,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 
v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: 
s_cbranch_execnz .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -6857,7 +6857,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6866,16 +6866,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11] -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6898,18 +6898,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6917,8 +6917,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6929,24 +6929,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; 
GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6966,25 +6966,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6997,19 +6997,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -7020,15 +7020,15 @@ define 
amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -7053,21 +7053,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, 
off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -7080,44 +7080,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], 
vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 ; GFX7LESS-NEXT: .LBB10_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7129,14 +7129,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], 
s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7161,11 +7161,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB10_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7176,53 +7176,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; 
GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB10_4 ; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 
s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7234,14 +7234,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7266,11 +7266,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB10_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -7280,55 +7280,55 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; 
GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7340,14 +7340,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 
s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7366,16 +7366,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB10_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -7385,37 +7385,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, 
off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: 
s_endpgm @@ -7423,11 +7423,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7435,11 +7435,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -7471,11 +7471,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB10_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-NEXT: 
s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start @@ -7492,18 +7492,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7511,8 +7511,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7521,7 +7521,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: 
s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -7530,9 +7530,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -7561,17 +7561,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start @@ -7586,25 +7586,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7613,22 +7613,22 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; 
GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7639,30 +7639,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -7675,44 +7675,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 
v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: 
s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7724,14 +7724,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7778,74 +7778,74 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; 
GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[44:45] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[52:53] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 
s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7857,14 +7857,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -7909,10 +7909,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) @@ -7922,55 +7922,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; 
GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; 
GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -7982,14 +7982,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -8024,14 +8024,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -8041,37 +8041,37 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; 
GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -8079,11 +8079,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -8091,11 +8091,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; 
GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -8153,10 +8153,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -8173,18 +8173,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 
s[30:31], s[0:1] @@ -8192,8 +8192,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8202,7 +8202,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8211,9 +8211,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -8264,14 +8264,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -8286,25 +8286,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: 
v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -9249,12 +9249,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -9274,8 +9274,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], 
s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 @@ -10682,12 +10682,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -10707,8 +10707,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 @@ -11597,12 +11597,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, 
SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 @@ -11622,8 +11622,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 @@ -12097,13 +12097,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, 
s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 @@ -12117,15 +12117,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -12137,11 +12137,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; 
GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -12154,40 +12154,40 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 
s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 @@ -12200,20 +12200,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-NEXT: s_mov_b32 s33, s10 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start @@ -12226,48 +12226,48 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: 
s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 @@ -12279,19 +12279,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-NEXT: s_mov_b32 s33, s10 -; GFX1064-NEXT: s_mov_b32 s42, s9 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b32 s50, s9 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -12304,53 +12304,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; 
GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] @@ -12359,18 +12359,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 
20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-NEXT: s_mov_b32 s33, s10 -; GFX1032-NEXT: s_mov_b32 s42, s9 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b32 s50, s9 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -12383,44 +12383,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; 
GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12441,16 +12441,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s10 -; GFX1164-NEXT: s_mov_b32 s42, s9 -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s50, s9 +; GFX1164-NEXT: s_mov_b32 s51, 
s8 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -12471,18 +12471,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12490,8 +12490,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz 
.LBB16_2 ; GFX1164-NEXT: .LBB16_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12499,12 +12499,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12519,15 +12519,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12545,25 +12545,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-NEXT: .LBB16_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12572,13 +12572,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; 
GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -12592,15 +12592,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -12612,11 +12612,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; 
GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -12629,40 +12629,40 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: 
buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-DPP-NEXT: .LBB16_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX9-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s66, -1 +; GFX9-DPP-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -12675,20 +12675,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX9-DPP-NEXT: s_mov_b32 s33, s10 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; 
GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start @@ -12701,48 +12701,48 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 @@ -12754,19 +12754,19 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12779,53 +12779,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; 
%bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] @@ -12834,18 +12834,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[52:53], 0x0 ; 
GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12858,44 +12858,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12916,16 +12916,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12946,18 +12946,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12965,8 +12965,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 
0x2 @@ -12974,12 +12974,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12994,15 +12994,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[52:53], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -13020,25 +13020,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: 
v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13051,19 +13051,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; 
GFX7LESS-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -13074,15 +13074,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec @@ -13107,21 +13107,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execz 
.LBB17_5 ; GFX7LESS-NEXT: ; %bb.3: -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -13134,44 +13134,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-NEXT: s_mov_b32 s13, s50 ; 
GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 ; GFX7LESS-NEXT: .LBB17_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s11 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s66, -1 +; GFX9-NEXT: s_mov_b32 s67, 0xe00000 +; GFX9-NEXT: s_add_u32 s64, s64, s11 +; GFX9-NEXT: s_addc_u32 s65, s65, 0 ; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b32 s51, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s50, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 
-; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13183,14 +13183,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13215,11 +13215,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB17_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[52:53], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-NEXT: s_mov_b64 s[54:55], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -13230,53 +13230,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX9-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[64:67], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s43 -; GFX9-NEXT: s_mov_b32 s13, s42 +; GFX9-NEXT: s_mov_b32 s12, s51 +; GFX9-NEXT: s_mov_b32 s13, s50 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-NEXT: v_mov_b32_e32 v2, s52 +; GFX9-NEXT: v_mov_b32_e32 v3, s53 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX9-NEXT: 
s_andn2_b64 exec, exec, s[54:55] ; GFX9-NEXT: s_cbranch_execnz .LBB17_4 ; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s66, -1 +; GFX1064-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-NEXT: s_mov_b32 s51, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s50, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13288,14 +13288,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s12, s51 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; 
GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13320,11 +13320,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB17_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -13334,55 +13334,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[6:7], 
s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s43 -; GFX1064-NEXT: s_mov_b32 s13, s42 +; GFX1064-NEXT: s_mov_b32 s12, s51 +; GFX1064-NEXT: s_mov_b32 s13, s50 ; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s66, -1 +; GFX1032-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s64, s64, s11 ; 
GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-NEXT: s_mov_b32 s51, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s50, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13394,14 +13394,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s12, s51 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13420,16 +13420,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s46, 0 +; GFX1032-NEXT: s_mov_b32 s54, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB17_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: 
s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -13439,37 +13439,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s43 -; GFX1032-NEXT: s_mov_b32 s13, s42 +; GFX1032-NEXT: s_mov_b32 s12, s51 +; GFX1032-NEXT: s_mov_b32 s13, s50 ; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: s_mov_b64 
s[2:3], s[66:67] +; GFX1032-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm @@ -13477,11 +13477,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b32 s51, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s50, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13489,11 +13489,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b32 s33, s10 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], 
s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 @@ -13525,11 +13525,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB17_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start @@ -13546,18 +13546,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s43 -; GFX1164-NEXT: s_mov_b32 s13, s42 +; GFX1164-NEXT: s_mov_b32 s12, s51 +; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; 
GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13565,8 +13565,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13575,7 +13575,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -13584,9 +13584,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-NEXT: s_mov_b32 s42, s14 -; GFX1132-NEXT: s_mov_b32 s43, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b32 s50, s14 +; GFX1132-NEXT: s_mov_b32 s51, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 @@ -13615,17 +13615,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s46, 0 +; 
GFX1132-NEXT: s_mov_b32 s54, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB17_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start @@ -13640,25 +13640,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s43 -; GFX1132-NEXT: s_mov_b32 s13, s42 +; GFX1132-NEXT: s_mov_b32 s12, s51 +; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13667,22 +13667,22 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s66, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s67, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s64, s64, s11 +; GFX7LESS-DPP-NEXT: s_addc_u32 s65, s65, 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 -; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 -; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, 
s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] @@ -13693,30 +13693,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[52:55], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[64:67], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; 
GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:8 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -13729,44 +13729,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s51 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s50 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s52 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[64:67], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX7LESS-DPP-NEXT: s_or_b64 s[54:55], vcc, 
s[54:55] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s54, -1 -; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 -; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s70, -1 +; GFX9-DPP-NEXT: s_mov_b32 s71, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s68, s68, s11 +; GFX9-DPP-NEXT: s_addc_u32 s69, s69, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b32 s51, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s50, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13778,14 +13778,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 
s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13832,74 +13832,74 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s53, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s52, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[54:55], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[64:65], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[54:55] ; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[44:45] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[52:53] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; 
GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[68:69] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[68:71], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[68:71], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[68:71], 0 offset:8 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s43 -; GFX9-DPP-NEXT: s_mov_b32 s13, s42 +; GFX9-DPP-NEXT: s_mov_b32 s12, s51 +; GFX9-DPP-NEXT: s_mov_b32 s13, s50 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[70:71] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s54 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s55 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[68:71], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[68:71], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] +; GFX9-DPP-NEXT: s_or_b64 s[64:65], vcc, s[64:65] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[64:65] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX9-DPP-NEXT: 
.LBB17_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s67, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -13911,14 +13911,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; 
GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -13963,10 +13963,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) @@ -13976,55 +13976,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: 
s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX1064-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[54:55] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 
-; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b32 s64, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s65, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s66, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s67, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s64, s64, s11 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_addc_u32 s65, s65, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s51, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -14036,14 +14036,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -14078,14 
+14078,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53] ; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -14095,37 +14095,37 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[64:67], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 
s[0:1], s[64:65] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[66:67] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[64:67], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 +; GFX1032-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -14133,11 +14133,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b32 s51, 
s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s50, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 @@ -14145,11 +14145,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 @@ -14207,10 +14207,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[54:55], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start @@ -14227,18 +14227,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -14246,8 +14246,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] +; GFX1164-DPP-NEXT: s_or_b64 s[54:55], vcc, s[54:55] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[54:55] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -14256,7 +14256,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] ; 
GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -14265,9 +14265,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] -; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b32 s50, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s51, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 @@ -14318,14 +14318,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s54, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[52:53], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[52:53] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start @@ -14340,25 +14340,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 +; GFX1132-DPP-NEXT: s_or_b32 s54, vcc_lo, s54 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s54 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir index dde84af57ed25..da1175c02e94a 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir @@ -31,102 +31,105 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr34_sgpr35 = COPY $sgpr8_sgpr9 ; CHECK-NEXT: renamable $sgpr33 = COPY $sgpr15 - ; CHECK-NEXT: renamable $sgpr42 = COPY 
$sgpr14 + ; CHECK-NEXT: renamable $sgpr50 = COPY $sgpr14 ; CHECK-NEXT: renamable $sgpr36_sgpr37 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: renamable $sgpr38_sgpr39 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: renamable $sgpr40_sgpr41 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: renamable $sgpr66_sgpr67 = S_LOAD_DWORDX2_IMM renamable $sgpr34_sgpr35, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4) - ; CHECK-NEXT: renamable $sgpr44 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr45 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr46 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr47 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr48 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr49 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr50 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr51 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr52 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr53 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr54 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr55 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr58 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr59 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr60 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr61 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr62 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr63 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr64 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr68_sgpr69 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: renamable $sgpr64_sgpr65 = S_LOAD_DWORDX2_IMM renamable $sgpr34_sgpr35, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4) + ; CHECK-NEXT: renamable $sgpr68 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr69 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr70 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr71 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr72 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr73 = S_MOV_B32 0 + ; CHECK-NEXT: renamable 
$sgpr74 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr75 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr76 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr77 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr78 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr79 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr80 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr81 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr82 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr83 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr84 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr85 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr86 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr87 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr88 = S_MOV_B32 0 + ; CHECK-NEXT: SI_SPILL_S1024_SAVE renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr52_sgpr53 = IMPLICIT_DEF ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL renamable $sgpr68_sgpr69, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL renamable $sgpr52_sgpr53, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY killed renamable $sgpr40_sgpr41 + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY killed renamable $sgpr48_sgpr49 ; CHECK-NEXT: $sgpr6_sgpr7 = COPY killed renamable $sgpr38_sgpr39 ; CHECK-NEXT: $sgpr8_sgpr9 = COPY killed renamable $sgpr34_sgpr35 ; CHECK-NEXT: $sgpr10_sgpr11 = COPY killed renamable $sgpr36_sgpr37 - ; 
CHECK-NEXT: $sgpr12 = COPY killed renamable $sgpr42 + ; CHECK-NEXT: $sgpr12 = COPY killed renamable $sgpr50 ; CHECK-NEXT: $sgpr13 = COPY killed renamable $sgpr33 - ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr68_sgpr69, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr52_sgpr53, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: renamable $sgpr4_sgpr5 = COPY $exec, implicit-def $exec ; CHECK-NEXT: dead renamable $sgpr6_sgpr7 = IMPLICIT_DEF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) - ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr66_sgpr67:0x000000000000000F, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF + ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr64_sgpr65:0x000000000000000F ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr6_sgpr7 = COPY $exec, implicit-def $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr66_sgpr67:0x000000000000000F, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF + ; CHECK-NEXT: liveins: $sgpr4_sgpr5, 
$sgpr64_sgpr65:0x000000000000000F ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 - ; CHECK-NEXT: renamable $sgpr6 = S_LSHL_B32 renamable $sgpr67, 1, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 + ; CHECK-NEXT: renamable $sgpr6 = S_LSHL_B32 renamable $sgpr65, 1, implicit-def dead $scc ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[COPY]], 0, killed $sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) - ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr66_sgpr67:0x000000000000000F, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF + ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr64_sgpr65:0x000000000000000F, 
$sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr6_sgpr7 = S_OR_SAVEEXEC_B64 renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: renamable $sgpr68 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr69 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr70 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr71 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr72 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr73 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr74 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr75 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr76 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr77 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr78 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr79 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr80 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr81 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr82 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr83 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr84 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr85 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr86 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr87 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr88 = COPY renamable $sgpr44 - ; CHECK-NEXT: renamable $sgpr89 = COPY renamable $sgpr44 - ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, implicit 
$exec + ; CHECK-NEXT: renamable $sgpr36 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr37 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr38 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr39 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr40 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr41 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr42 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr43 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr44 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr45 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr46 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr47 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr48 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr49 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr50 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr51 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr52 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr53 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr54 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr55 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr56 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr57 = COPY killed renamable $sgpr68 + ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: liveins: $sgpr6_sgpr7, $sgpr66_sgpr67:0x0000000000000003, 
$sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF + ; CHECK-NEXT: liveins: $sgpr6_sgpr7, $sgpr64_sgpr65:0x0000000000000003 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc - ; CHECK-NEXT: dead renamable $sgpr4 = S_LSHL_B32 killed renamable $sgpr66, 1, implicit-def dead $scc - ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 + ; CHECK-NEXT: dead renamable $sgpr4 = S_LSHL_B32 killed renamable $sgpr64, 1, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_1024 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: bb.0: @@ -211,7 +214,6 @@ body: | %15.sub19:sgpr_1024 = COPY %7.sub0 %15.sub20:sgpr_1024 = COPY %7.sub0 %15.sub21:sgpr_1024 = COPY %7.sub0 - ; Spill code ends up getting inserted here, and we end up with many unspillable sgpr1024 ranges %16:vreg_1024 = COPY %15, implicit $exec $exec = S_XOR_B64_term $exec, %14, implicit-def $scc S_CBRANCH_EXECZ %bb.5, implicit 
$exec diff --git a/llvm/test/CodeGen/AMDGPU/greedy-liverange-priority.mir b/llvm/test/CodeGen/AMDGPU/greedy-liverange-priority.mir index ffa205b2776ad..c4ef72416588a 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-liverange-priority.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-liverange-priority.mir @@ -74,7 +74,7 @@ body: | %3:vgpr_32 = V_ADD_U32_e32 %30, %6, implicit $exec %4:vgpr_32 = V_ADD_U32_e32 %36.sub1, %8, implicit $exec %15:vgpr_32 = V_OR_B32_e32 %4, %3, implicit $exec - %21:sreg_32 = V_READFIRSTLANE_B32 %19, implicit $exec + %21:sreg_32_xm0 = V_READFIRSTLANE_B32 %19, implicit $exec %17:sreg_32 = V_CMP_EQ_U32_e64 0, %15, implicit $exec S_CMP_LG_U32 %21, 0, implicit-def $scc %31:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 05ff642845628..efdd979c54daf 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -14,71 +14,53 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v5, s37, 1 ; CHECK-NEXT: v_writelane_b32 v5, s38, 2 ; CHECK-NEXT: v_writelane_b32 v5, s39, 3 -; CHECK-NEXT: v_writelane_b32 v5, s40, 4 -; CHECK-NEXT: v_writelane_b32 v5, s41, 5 -; CHECK-NEXT: v_writelane_b32 v5, s42, 6 -; CHECK-NEXT: v_writelane_b32 v5, s43, 7 -; CHECK-NEXT: v_writelane_b32 v5, s44, 8 -; CHECK-NEXT: v_writelane_b32 v5, s45, 9 -; CHECK-NEXT: v_writelane_b32 v5, s46, 10 -; CHECK-NEXT: v_writelane_b32 v5, s47, 11 -; CHECK-NEXT: v_writelane_b32 v5, s48, 12 -; CHECK-NEXT: v_writelane_b32 v5, s49, 13 -; CHECK-NEXT: v_writelane_b32 v5, s50, 14 -; CHECK-NEXT: v_writelane_b32 v5, s51, 15 -; CHECK-NEXT: v_writelane_b32 v5, s52, 16 -; CHECK-NEXT: v_writelane_b32 v5, s53, 17 -; CHECK-NEXT: v_writelane_b32 v5, s54, 18 -; CHECK-NEXT: v_writelane_b32 v5, s55, 19 -; CHECK-NEXT: v_writelane_b32 v5, s56, 20 -; CHECK-NEXT: v_writelane_b32 
v5, s57, 21 -; CHECK-NEXT: v_writelane_b32 v5, s58, 22 -; CHECK-NEXT: v_writelane_b32 v5, s59, 23 -; CHECK-NEXT: v_writelane_b32 v5, s60, 24 -; CHECK-NEXT: v_writelane_b32 v5, s61, 25 -; CHECK-NEXT: v_writelane_b32 v5, s62, 26 -; CHECK-NEXT: v_writelane_b32 v5, s63, 27 -; CHECK-NEXT: v_writelane_b32 v5, s64, 28 -; CHECK-NEXT: v_writelane_b32 v5, s65, 29 -; CHECK-NEXT: v_writelane_b32 v5, s66, 30 -; CHECK-NEXT: v_writelane_b32 v5, s67, 31 -; CHECK-NEXT: v_writelane_b32 v5, s30, 32 -; CHECK-NEXT: v_writelane_b32 v5, s31, 33 +; CHECK-NEXT: v_writelane_b32 v5, s48, 4 +; CHECK-NEXT: v_writelane_b32 v5, s49, 5 +; CHECK-NEXT: v_writelane_b32 v5, s50, 6 +; CHECK-NEXT: v_writelane_b32 v5, s51, 7 +; CHECK-NEXT: v_writelane_b32 v5, s52, 8 +; CHECK-NEXT: v_writelane_b32 v5, s53, 9 +; CHECK-NEXT: v_writelane_b32 v5, s54, 10 +; CHECK-NEXT: v_writelane_b32 v5, s55, 11 +; CHECK-NEXT: v_writelane_b32 v5, s64, 12 +; CHECK-NEXT: v_writelane_b32 v5, s65, 13 +; CHECK-NEXT: v_writelane_b32 v5, s66, 14 +; CHECK-NEXT: v_writelane_b32 v5, s67, 15 ; CHECK-NEXT: s_getpc_b64 s[24:25] ; CHECK-NEXT: s_movk_i32 s4, 0xf0 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[44:59], s[4:5], 0x0 ; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[40:43], s[4:5], 0x0 ; CHECK-NEXT: s_movk_i32 s20, 0x130 ; CHECK-NEXT: s_mov_b32 s21, s24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v7, s36, 0 -; CHECK-NEXT: v_writelane_b32 v7, s37, 1 -; CHECK-NEXT: v_writelane_b32 v7, s38, 2 -; CHECK-NEXT: v_writelane_b32 v7, s39, 3 -; CHECK-NEXT: v_writelane_b32 v7, s40, 4 -; CHECK-NEXT: v_writelane_b32 v7, s41, 5 -; CHECK-NEXT: v_writelane_b32 v7, s42, 6 -; CHECK-NEXT: v_writelane_b32 v7, s43, 7 -; CHECK-NEXT: v_writelane_b32 v7, s44, 8 -; CHECK-NEXT: v_writelane_b32 v7, s45, 9 -; CHECK-NEXT: 
v_writelane_b32 v7, s46, 10 +; CHECK-NEXT: v_writelane_b32 v7, s44, 0 +; CHECK-NEXT: v_writelane_b32 v7, s45, 1 +; CHECK-NEXT: v_writelane_b32 v7, s46, 2 +; CHECK-NEXT: v_writelane_b32 v7, s47, 3 +; CHECK-NEXT: v_writelane_b32 v7, s48, 4 +; CHECK-NEXT: v_writelane_b32 v7, s49, 5 +; CHECK-NEXT: v_writelane_b32 v7, s50, 6 +; CHECK-NEXT: v_writelane_b32 v7, s51, 7 +; CHECK-NEXT: v_writelane_b32 v7, s52, 8 +; CHECK-NEXT: v_writelane_b32 v7, s53, 9 +; CHECK-NEXT: v_writelane_b32 v7, s54, 10 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 -; CHECK-NEXT: v_writelane_b32 v7, s47, 11 -; CHECK-NEXT: v_writelane_b32 v7, s48, 12 +; CHECK-NEXT: v_writelane_b32 v7, s55, 11 +; CHECK-NEXT: v_writelane_b32 v7, s56, 12 ; CHECK-NEXT: s_mov_b32 s20, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_writelane_b32 v7, s49, 13 -; CHECK-NEXT: v_mov_b32_e32 v2, s28 +; CHECK-NEXT: v_writelane_b32 v7, s57, 13 +; CHECK-NEXT: v_mov_b32_e32 v2, s40 ; CHECK-NEXT: v_mov_b32_e32 v3, v1 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_writelane_b32 v7, s50, 14 -; CHECK-NEXT: v_writelane_b32 v7, s51, 15 -; CHECK-NEXT: image_sample_lz v3, v[2:3], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v7, s58, 14 +; CHECK-NEXT: v_writelane_b32 v7, s59, 15 +; CHECK-NEXT: image_sample_lz v3, v[2:3], s[52:59], s[20:23] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_writelane_b32 v7, s4, 16 @@ -102,9 +84,9 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v7, s19, 31 ; CHECK-NEXT: s_load_dwordx8 s[4:11], s[26:27], 0x0 ; CHECK-NEXT: s_movk_i32 s28, 0x1f0 -; CHECK-NEXT: s_movk_i32 s30, 0x2f0 +; CHECK-NEXT: s_movk_i32 s72, 0x2f0 ; CHECK-NEXT: s_mov_b32 s29, s24 -; CHECK-NEXT: s_mov_b32 s31, s24 +; CHECK-NEXT: s_mov_b32 s73, s24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_writelane_b32 v7, s4, 32 ; CHECK-NEXT: v_writelane_b32 v7, s5, 33 @@ -115,7 +97,7 @@ 
define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v7, s10, 38 ; CHECK-NEXT: v_writelane_b32 v7, s11, 39 ; CHECK-NEXT: s_load_dwordx16 s[52:67], s[28:29], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[4:19], s[72:73], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1 @@ -146,19 +128,19 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s43, v7, 7 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_readlane_b32 s36, v7, 32 -; CHECK-NEXT: v_readlane_b32 s40, v7, 36 -; CHECK-NEXT: v_readlane_b32 s41, v7, 37 -; CHECK-NEXT: v_readlane_b32 s42, v7, 38 -; CHECK-NEXT: v_readlane_b32 s43, v7, 39 +; CHECK-NEXT: v_readlane_b32 s40, v7, 32 +; CHECK-NEXT: v_readlane_b32 s44, v7, 36 +; CHECK-NEXT: v_readlane_b32 s45, v7, 37 +; CHECK-NEXT: v_readlane_b32 s46, v7, 38 +; CHECK-NEXT: v_readlane_b32 s47, v7, 39 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_readlane_b32 s37, v7, 33 -; CHECK-NEXT: v_readlane_b32 s38, v7, 34 +; CHECK-NEXT: v_readlane_b32 s41, v7, 33 +; CHECK-NEXT: v_readlane_b32 s42, v7, 34 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[60:67], s[40:43] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s39, v7, 35 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[60:67], s[44:47] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s43, v7, 35 ; CHECK-NEXT: image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v1, v4 @@ -334,36 +316,18 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock ; CHECK-NEXT: s_or_b64 exec, exec, s[20:21] -; CHECK-NEXT: v_readlane_b32 s30, v5, 32 -; CHECK-NEXT: v_readlane_b32 s31, v5, 33 -; CHECK-NEXT: v_readlane_b32 s67, v5, 31 -; CHECK-NEXT: 
v_readlane_b32 s66, v5, 30 -; CHECK-NEXT: v_readlane_b32 s65, v5, 29 -; CHECK-NEXT: v_readlane_b32 s64, v5, 28 -; CHECK-NEXT: v_readlane_b32 s63, v5, 27 -; CHECK-NEXT: v_readlane_b32 s62, v5, 26 -; CHECK-NEXT: v_readlane_b32 s61, v5, 25 -; CHECK-NEXT: v_readlane_b32 s60, v5, 24 -; CHECK-NEXT: v_readlane_b32 s59, v5, 23 -; CHECK-NEXT: v_readlane_b32 s58, v5, 22 -; CHECK-NEXT: v_readlane_b32 s57, v5, 21 -; CHECK-NEXT: v_readlane_b32 s56, v5, 20 -; CHECK-NEXT: v_readlane_b32 s55, v5, 19 -; CHECK-NEXT: v_readlane_b32 s54, v5, 18 -; CHECK-NEXT: v_readlane_b32 s53, v5, 17 -; CHECK-NEXT: v_readlane_b32 s52, v5, 16 -; CHECK-NEXT: v_readlane_b32 s51, v5, 15 -; CHECK-NEXT: v_readlane_b32 s50, v5, 14 -; CHECK-NEXT: v_readlane_b32 s49, v5, 13 -; CHECK-NEXT: v_readlane_b32 s48, v5, 12 -; CHECK-NEXT: v_readlane_b32 s47, v5, 11 -; CHECK-NEXT: v_readlane_b32 s46, v5, 10 -; CHECK-NEXT: v_readlane_b32 s45, v5, 9 -; CHECK-NEXT: v_readlane_b32 s44, v5, 8 -; CHECK-NEXT: v_readlane_b32 s43, v5, 7 -; CHECK-NEXT: v_readlane_b32 s42, v5, 6 -; CHECK-NEXT: v_readlane_b32 s41, v5, 5 -; CHECK-NEXT: v_readlane_b32 s40, v5, 4 +; CHECK-NEXT: v_readlane_b32 s67, v5, 15 +; CHECK-NEXT: v_readlane_b32 s66, v5, 14 +; CHECK-NEXT: v_readlane_b32 s65, v5, 13 +; CHECK-NEXT: v_readlane_b32 s64, v5, 12 +; CHECK-NEXT: v_readlane_b32 s55, v5, 11 +; CHECK-NEXT: v_readlane_b32 s54, v5, 10 +; CHECK-NEXT: v_readlane_b32 s53, v5, 9 +; CHECK-NEXT: v_readlane_b32 s52, v5, 8 +; CHECK-NEXT: v_readlane_b32 s51, v5, 7 +; CHECK-NEXT: v_readlane_b32 s50, v5, 6 +; CHECK-NEXT: v_readlane_b32 s49, v5, 5 +; CHECK-NEXT: v_readlane_b32 s48, v5, 4 ; CHECK-NEXT: v_readlane_b32 s39, v5, 3 ; CHECK-NEXT: v_readlane_b32 s38, v5, 2 ; CHECK-NEXT: v_readlane_b32 s37, v5, 1 diff --git a/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll index 1113acb3c0305..ba1cb9b26dec6 100644 --- a/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll +++ b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll @@ -1,4 +1,6 @@ ; 
RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 -misched=gcn-iterative-ilp < %s | FileCheck %s ; Test should not result in build failure ; CHECK-LABEL: shouldNotReApply diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 74020c43a3ca3..b171bf42cce41 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -154,8 +154,7 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1 -; SI-MOVREL-NEXT: s_mov_b32 m0, s6 +; SI-MOVREL-NEXT: s_add_i32 m0, s6, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 @@ -183,8 +182,7 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -439,12 +437,12 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1 +; SI-MOVREL-NEXT: s_add_i32 m0, s6, 1 ; SI-MOVREL-NEXT: s_or_b32 s8, s8, 1 ; SI-MOVREL-NEXT: s_or_b32 s4, s23, 16 ; SI-MOVREL-NEXT: s_or_b32 s5, s22, 15 -; SI-MOVREL-NEXT: s_or_b32 s7, s21, 14 -; 
SI-MOVREL-NEXT: s_or_b32 s20, s20, 13 +; SI-MOVREL-NEXT: s_or_b32 s6, s21, 14 +; SI-MOVREL-NEXT: s_or_b32 s7, s20, 13 ; SI-MOVREL-NEXT: s_or_b32 s19, s19, 12 ; SI-MOVREL-NEXT: s_or_b32 s18, s18, 11 ; SI-MOVREL-NEXT: s_or_b32 s17, s17, 10 @@ -457,7 +455,6 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; SI-MOVREL-NEXT: s_or_b32 s10, s10, 3 ; SI-MOVREL-NEXT: s_or_b32 s9, s9, 2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 -; SI-MOVREL-NEXT: s_mov_b32 m0, s6 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 @@ -469,8 +466,8 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s17 ; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 ; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s19 -; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s20 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s6 ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s5 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s4 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 @@ -483,16 +480,16 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; VI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1 ; VI-MOVREL-NEXT: s_or_b32 s10, s10, 3 ; VI-MOVREL-NEXT: s_or_b32 s9, s9, 2 ; VI-MOVREL-NEXT: s_or_b32 s8, s8, 1 -; VI-MOVREL-NEXT: s_or_b32 s3, s23, 16 -; VI-MOVREL-NEXT: s_or_b32 s4, s22, 15 -; VI-MOVREL-NEXT: s_or_b32 s5, s21, 14 -; VI-MOVREL-NEXT: s_or_b32 s6, s20, 13 -; VI-MOVREL-NEXT: s_or_b32 s7, s19, 12 -; VI-MOVREL-NEXT: s_or_b32 s18, s18, 11 +; VI-MOVREL-NEXT: s_or_b32 s2, s23, 16 +; VI-MOVREL-NEXT: s_or_b32 s3, s22, 15 +; VI-MOVREL-NEXT: s_or_b32 s4, s21, 14 +; VI-MOVREL-NEXT: s_or_b32 s5, s20, 13 +; VI-MOVREL-NEXT: s_or_b32 s6, 
s19, 12 +; VI-MOVREL-NEXT: s_or_b32 s7, s18, 11 ; VI-MOVREL-NEXT: s_or_b32 s17, s17, 10 ; VI-MOVREL-NEXT: s_or_b32 s16, s16, 9 ; VI-MOVREL-NEXT: s_or_b32 s15, s15, 8 @@ -503,7 +500,6 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 @@ -511,12 +507,12 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 ; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s16 ; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s17 -; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 -; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s7 -; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s6 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s2 ; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 @@ -885,10 +881,8 @@ define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %o ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) ; GENERIC-NEXT: s_addk_i32 s2, 0xfe00 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 1 -; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: s_cselect_b32 s4, 1, 0 ; GENERIC-NEXT: s_cmp_lg_u32 s2, 2 -; GENERIC-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GENERIC-NEXT: v_readfirstlane_b32 s4, v0 ; GENERIC-NEXT: s_cselect_b32 s4, s4, 2 ; GENERIC-NEXT: s_cmp_lg_u32 s2, 3 ; GENERIC-NEXT: s_cselect_b32 s4, s4, 3 @@ -2081,7 +2075,7 @@ define amdgpu_kernel void @insert_w_offset(ptr 
addrspace(1) %out, i32 %in) { ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1 +; SI-MOVREL-NEXT: s_add_i32 m0, s6, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -2096,7 +2090,6 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 -; SI-MOVREL-NEXT: s_mov_b32 m0, s6 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 @@ -2114,8 +2107,7 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 @@ -2437,7 +2429,7 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_and_b32 s4, s6, 0xffff +; SI-MOVREL-NEXT: s_and_b32 m0, s6, 0xffff ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -2452,7 +2444,6 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 -; SI-MOVREL-NEXT: s_mov_b32 m0, s4 ; SI-MOVREL-NEXT: s_mov_b32 s3, 
0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 @@ -2470,8 +2461,7 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_and_b32 s2, s2, 0xffff -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_and_b32 m0, s2, 0xffff ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 @@ -2796,7 +2786,7 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; SI-MOVREL-NEXT: s_sext_i32_i16 s4, s6 -; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; SI-MOVREL-NEXT: s_add_i32 m0, s4, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -2811,7 +2801,6 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 -; SI-MOVREL-NEXT: s_mov_b32 m0, s4 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 @@ -2830,8 +2819,7 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; VI-MOVREL-NEXT: s_sext_i32_i16 s2, s2 -; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 @@ -3319,57 +3307,56 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; GENERIC-NEXT: 
s_mov_b32 s2, -1 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) ; GENERIC-NEXT: s_addk_i32 s6, 0xfe00 -; GENERIC-NEXT: s_cmp_eq_u32 s6, 0 -; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 3 -; GENERIC-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GENERIC-NEXT: s_cselect_b32 s4, 16, 3 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 2 -; GENERIC-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GENERIC-NEXT: s_cselect_b32 s5, 16, 2 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 1 ; GENERIC-NEXT: v_mov_b32_e32 v3, s4 ; GENERIC-NEXT: s_cselect_b32 s4, 16, 1 -; GENERIC-NEXT: s_cmp_eq_u32 s6, 7 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 0 ; GENERIC-NEXT: v_mov_b32_e32 v2, s5 -; GENERIC-NEXT: s_cselect_b32 s5, 16, 7 -; GENERIC-NEXT: s_cmp_eq_u32 s6, 6 +; GENERIC-NEXT: s_cselect_b32 s5, 16, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 7 ; GENERIC-NEXT: v_mov_b32_e32 v1, s4 -; GENERIC-NEXT: s_cselect_b32 s4, 16, 6 +; GENERIC-NEXT: s_cselect_b32 s4, 16, 7 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 6 +; GENERIC-NEXT: v_mov_b32_e32 v0, s5 +; GENERIC-NEXT: s_cselect_b32 s5, 16, 6 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 5 -; GENERIC-NEXT: v_mov_b32_e32 v7, s5 -; GENERIC-NEXT: s_cselect_b32 s5, 16, 5 +; GENERIC-NEXT: v_mov_b32_e32 v7, s4 +; GENERIC-NEXT: s_cselect_b32 s4, 16, 5 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 4 -; GENERIC-NEXT: v_mov_b32_e32 v6, s4 -; GENERIC-NEXT: s_cselect_b32 s4, 16, 4 +; GENERIC-NEXT: v_mov_b32_e32 v6, s5 +; GENERIC-NEXT: s_cselect_b32 s5, 16, 4 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 11 -; GENERIC-NEXT: v_mov_b32_e32 v5, s5 -; GENERIC-NEXT: s_cselect_b32 s5, 16, 11 +; GENERIC-NEXT: v_mov_b32_e32 v5, s4 +; GENERIC-NEXT: s_cselect_b32 s4, 16, 11 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 10 -; GENERIC-NEXT: v_mov_b32_e32 v4, s4 +; GENERIC-NEXT: v_mov_b32_e32 v4, s5 ; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GENERIC-NEXT: s_cselect_b32 s4, 16, 10 +; GENERIC-NEXT: s_cselect_b32 s5, 16, 10 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 9 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_mov_b32_e32 v7, s5 -; 
GENERIC-NEXT: s_cselect_b32 s5, 16, 9 +; GENERIC-NEXT: v_mov_b32_e32 v7, s4 +; GENERIC-NEXT: s_cselect_b32 s4, 16, 9 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 8 -; GENERIC-NEXT: v_mov_b32_e32 v6, s4 -; GENERIC-NEXT: s_cselect_b32 s4, 16, 8 +; GENERIC-NEXT: v_mov_b32_e32 v6, s5 +; GENERIC-NEXT: s_cselect_b32 s5, 16, 8 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 15 -; GENERIC-NEXT: v_mov_b32_e32 v5, s5 -; GENERIC-NEXT: s_cselect_b32 s5, 16, 15 +; GENERIC-NEXT: v_mov_b32_e32 v5, s4 +; GENERIC-NEXT: s_cselect_b32 s4, 16, 15 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 14 -; GENERIC-NEXT: v_mov_b32_e32 v4, s4 -; GENERIC-NEXT: s_cselect_b32 s4, 16, 14 +; GENERIC-NEXT: v_mov_b32_e32 v4, s5 +; GENERIC-NEXT: s_cselect_b32 s5, 16, 14 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 13 ; GENERIC-NEXT: s_cselect_b32 s7, 16, 13 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 12 ; GENERIC-NEXT: s_cselect_b32 s6, 16, 12 ; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_mov_b32_e32 v7, s5 -; GENERIC-NEXT: v_mov_b32_e32 v6, s4 +; GENERIC-NEXT: v_mov_b32_e32 v7, s4 +; GENERIC-NEXT: v_mov_b32_e32 v6, s5 ; GENERIC-NEXT: v_mov_b32_e32 v5, s7 ; GENERIC-NEXT: v_mov_b32_e32 v4, s6 ; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 @@ -6935,9 +6922,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 -; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s3, s2, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 m0, s2, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 @@ -6951,10 +6938,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; SI-MOVREL-NEXT: 
v_mov_b32_e32 v15, 0x41800000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 -; SI-MOVREL-NEXT: s_mov_b32 m0, s3 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 -; SI-MOVREL-NEXT: s_add_i32 s2, s2, 2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 +; SI-MOVREL-NEXT: s_add_i32 m0, s2, 2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 ; SI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 ; SI-MOVREL-NEXT: v_mov_b32_e32 v28, v12 @@ -6970,7 +6956,6 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, v0 -; SI-MOVREL-NEXT: s_mov_b32 m0, s2 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32 @@ -6991,9 +6976,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 -; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_add_i32 s3, s2, 1 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 @@ -7007,11 +6992,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 -; VI-MOVREL-NEXT: s_mov_b32 m0, s3 -; VI-MOVREL-NEXT: s_add_i32 s2, s2, 2 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 2 ; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 ; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 @@ -8060,8 +8043,7 @@ define amdgpu_kernel void 
@extract_largest_inbounds_offset(ptr addrspace(1) %out ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s12, s12, 15 -; SI-MOVREL-NEXT: s_mov_b32 m0, s12 +; SI-MOVREL-NEXT: s_add_i32 m0, s12, 15 ; SI-MOVREL-NEXT: s_mov_b32 s0, s8 ; SI-MOVREL-NEXT: s_mov_b32 s1, s9 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 @@ -8092,9 +8074,8 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3 -; VI-MOVREL-NEXT: s_add_i32 s6, s6, 15 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2 -; VI-MOVREL-NEXT: s_mov_b32 m0, s6 +; VI-MOVREL-NEXT: s_add_i32 m0, s6, 15 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0 @@ -8324,8 +8305,7 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s12, s12, 16 -; SI-MOVREL-NEXT: s_mov_b32 m0, s12 +; SI-MOVREL-NEXT: s_add_i32 m0, s12, 16 ; SI-MOVREL-NEXT: s_mov_b32 s0, s8 ; SI-MOVREL-NEXT: s_mov_b32 s1, s9 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 @@ -8356,9 +8336,8 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3 -; VI-MOVREL-NEXT: s_add_i32 s6, s6, 16 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2 -; VI-MOVREL-NEXT: s_mov_b32 m0, s6 +; VI-MOVREL-NEXT: s_add_i32 m0, s6, 16 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0 @@ -8589,8 +8568,7 @@ define 
amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: s_lshl_b32 s4, s12, 2 -; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_lshl_b32 m0, s12, 2 ; SI-MOVREL-NEXT: s_mov_b32 s0, s8 ; SI-MOVREL-NEXT: s_mov_b32 s1, s9 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1 @@ -8621,12 +8599,11 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3 -; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0 -; VI-MOVREL-NEXT: s_lshl_b32 s0, s6, 2 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2 -; VI-MOVREL-NEXT: s_mov_b32 m0, s0 +; VI-MOVREL-NEXT: s_lshl_b32 m0, s6, 2 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1 ; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1 ; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0 @@ -8865,7 +8842,7 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_lshl_b32 s4, s6, 2 +; SI-MOVREL-NEXT: s_lshl_b32 m0, s6, 2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 @@ -8882,7 +8859,6 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s21 ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s22 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s23 -; SI-MOVREL-NEXT: s_mov_b32 m0, s4 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 @@ 
-8898,9 +8874,8 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_lshl_b32 s2, s2, 2 +; VI-MOVREL-NEXT: s_lshl_b32 m0, s2, 2 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 3e6143866bf88..a15bf7f32dc27 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -46,19 +46,15 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX12-NEXT: s_getpc_b64 s[6:7] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sext_i32_i16 s7, s7 -; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+12 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+24 +; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+16 ; GFX12-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], 0 ; GFX12-NEXT: s_getpc_b64 s[12:13] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sext_i32_i16 s13, s13 -; GFX12-NEXT: s_add_co_u32 s12, s12, wobble@gotpcrel32@lo+12 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_ci_u32 s13, s13, wobble@gotpcrel32@hi+24 +; GFX12-NEXT: s_add_co_u32 s12, s12, wobble@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s13, s13, wobble@gotpcrel32@hi+16 ; GFX12-NEXT: s_load_u8 s14, s[4:5], 0x0 ; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX12-NEXT: s_load_b64 s[6:7], s[12:13], 0x0 @@ -67,13 +63,12 @@ define amdgpu_kernel void 
@indirect_call_known_no_special_inputs() { ; GFX12-NEXT: s_mov_b32 s32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s12, 1, s14 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s12, 1 ; GFX12-NEXT: s_cselect_b32 s13, s7, s5 ; GFX12-NEXT: s_cselect_b32 s12, s6, s4 ; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 23a7e16d72067..a41ef1a6c3418 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -134,59 +134,59 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s37, 3 ; GCN-NEXT: v_writelane_b32 v40, s38, 4 ; GCN-NEXT: v_writelane_b32 v40, s39, 5 -; GCN-NEXT: v_writelane_b32 v40, s40, 6 -; GCN-NEXT: v_writelane_b32 v40, s41, 7 -; GCN-NEXT: v_writelane_b32 v40, s42, 8 -; GCN-NEXT: v_writelane_b32 v40, s43, 9 -; GCN-NEXT: v_writelane_b32 v40, s44, 10 -; GCN-NEXT: v_writelane_b32 v40, s45, 11 -; GCN-NEXT: v_writelane_b32 v40, s46, 12 -; GCN-NEXT: v_writelane_b32 v40, s47, 13 -; GCN-NEXT: v_writelane_b32 v40, s48, 14 -; GCN-NEXT: v_writelane_b32 v40, s49, 15 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s64, 14 +; GCN-NEXT: v_writelane_b32 v40, s65, 15 ; GCN-NEXT: v_writelane_b32 v40, s30, 16 ; GCN-NEXT: v_writelane_b32 v40, s31, 17 -; GCN-NEXT: s_mov_b32 s42, s15 -; GCN-NEXT: s_mov_b32 s43, s14 -; GCN-NEXT: s_mov_b32 s44, s13 -; GCN-NEXT: s_mov_b32 s45, s12 +; 
GCN-NEXT: s_mov_b32 s50, s15 +; GCN-NEXT: s_mov_b32 s51, s14 +; GCN-NEXT: s_mov_b32 s52, s13 +; GCN-NEXT: s_mov_b32 s53, s12 ; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] -; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] -; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: s_mov_b64 s[48:49], s[4:5] +; GCN-NEXT: s_mov_b64 s[54:55], exec ; GCN-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_and_saveexec_b64 s[64:65], vcc +; GCN-NEXT: s_mov_b64 s[4:5], s[48:49] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] ; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] -; GCN-NEXT: s_mov_b32 s12, s45 -; GCN-NEXT: s_mov_b32 s13, s44 -; GCN-NEXT: s_mov_b32 s14, s43 -; GCN-NEXT: s_mov_b32 s15, s42 +; GCN-NEXT: s_mov_b32 s12, s53 +; GCN-NEXT: s_mov_b32 s13, s52 +; GCN-NEXT: s_mov_b32 s14, s51 +; GCN-NEXT: s_mov_b32 s15, s50 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] +; GCN-NEXT: s_xor_b64 exec, exec, s[64:65] ; GCN-NEXT: s_cbranch_execnz .LBB2_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] +; GCN-NEXT: s_mov_b64 exec, s[54:55] ; GCN-NEXT: v_readlane_b32 s30, v40, 16 ; GCN-NEXT: v_readlane_b32 s31, v40, 17 -; GCN-NEXT: v_readlane_b32 s49, v40, 15 -; GCN-NEXT: v_readlane_b32 s48, v40, 14 -; GCN-NEXT: v_readlane_b32 s47, v40, 13 -; GCN-NEXT: v_readlane_b32 s46, v40, 12 -; GCN-NEXT: v_readlane_b32 s45, v40, 11 -; GCN-NEXT: v_readlane_b32 s44, v40, 10 -; GCN-NEXT: v_readlane_b32 s43, v40, 9 -; GCN-NEXT: v_readlane_b32 s42, v40, 8 -; GCN-NEXT: v_readlane_b32 s41, v40, 7 -; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s65, 
v40, 15 +; GCN-NEXT: v_readlane_b32 s64, v40, 14 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 ; GCN-NEXT: v_readlane_b32 s39, v40, 5 ; GCN-NEXT: v_readlane_b32 s38, v40, 4 ; GCN-NEXT: v_readlane_b32 s37, v40, 3 @@ -218,59 +218,59 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 3 ; GISEL-NEXT: v_writelane_b32 v40, s38, 4 ; GISEL-NEXT: v_writelane_b32 v40, s39, 5 -; GISEL-NEXT: v_writelane_b32 v40, s40, 6 -; GISEL-NEXT: v_writelane_b32 v40, s41, 7 -; GISEL-NEXT: v_writelane_b32 v40, s42, 8 -; GISEL-NEXT: v_writelane_b32 v40, s43, 9 -; GISEL-NEXT: v_writelane_b32 v40, s44, 10 -; GISEL-NEXT: v_writelane_b32 v40, s45, 11 -; GISEL-NEXT: v_writelane_b32 v40, s46, 12 -; GISEL-NEXT: v_writelane_b32 v40, s47, 13 -; GISEL-NEXT: v_writelane_b32 v40, s48, 14 -; GISEL-NEXT: v_writelane_b32 v40, s49, 15 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s64, 14 +; GISEL-NEXT: v_writelane_b32 v40, s65, 15 ; GISEL-NEXT: v_writelane_b32 v40, s30, 16 ; GISEL-NEXT: v_writelane_b32 v40, s31, 17 -; GISEL-NEXT: s_mov_b32 s42, s15 -; GISEL-NEXT: s_mov_b32 s43, s14 -; GISEL-NEXT: s_mov_b32 s44, s13 -; GISEL-NEXT: s_mov_b32 s45, s12 +; GISEL-NEXT: s_mov_b32 s50, s15 +; GISEL-NEXT: s_mov_b32 s51, s14 +; GISEL-NEXT: s_mov_b32 s52, s13 +; GISEL-NEXT: s_mov_b32 s53, s12 ; GISEL-NEXT: s_mov_b64 s[34:35], s[10:11] ; GISEL-NEXT: s_mov_b64 s[36:37], 
s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] -; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] -; GISEL-NEXT: s_mov_b64 s[46:47], exec +; GISEL-NEXT: s_mov_b64 s[48:49], s[4:5] +; GISEL-NEXT: s_mov_b64 s[54:55], exec ; GISEL-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] +; GISEL-NEXT: s_and_saveexec_b64 s[64:65], vcc +; GISEL-NEXT: s_mov_b64 s[4:5], s[48:49] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] ; GISEL-NEXT: s_mov_b64 s[10:11], s[34:35] -; GISEL-NEXT: s_mov_b32 s12, s45 -; GISEL-NEXT: s_mov_b32 s13, s44 -; GISEL-NEXT: s_mov_b32 s14, s43 -; GISEL-NEXT: s_mov_b32 s15, s42 +; GISEL-NEXT: s_mov_b32 s12, s53 +; GISEL-NEXT: s_mov_b32 s13, s52 +; GISEL-NEXT: s_mov_b32 s14, s51 +; GISEL-NEXT: s_mov_b32 s15, s50 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] +; GISEL-NEXT: s_xor_b64 exec, exec, s[64:65] ; GISEL-NEXT: s_cbranch_execnz .LBB2_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[46:47] +; GISEL-NEXT: s_mov_b64 exec, s[54:55] ; GISEL-NEXT: v_readlane_b32 s30, v40, 16 ; GISEL-NEXT: v_readlane_b32 s31, v40, 17 -; GISEL-NEXT: v_readlane_b32 s49, v40, 15 -; GISEL-NEXT: v_readlane_b32 s48, v40, 14 -; GISEL-NEXT: v_readlane_b32 s47, v40, 13 -; GISEL-NEXT: v_readlane_b32 s46, v40, 12 -; GISEL-NEXT: v_readlane_b32 s45, v40, 11 -; GISEL-NEXT: v_readlane_b32 s44, v40, 10 -; GISEL-NEXT: v_readlane_b32 s43, v40, 9 -; GISEL-NEXT: v_readlane_b32 s42, v40, 8 -; GISEL-NEXT: v_readlane_b32 s41, v40, 7 -; GISEL-NEXT: v_readlane_b32 s40, v40, 6 +; GISEL-NEXT: v_readlane_b32 s65, v40, 15 +; GISEL-NEXT: v_readlane_b32 s64, v40, 14 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: 
v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 ; GISEL-NEXT: v_readlane_b32 s39, v40, 5 ; GISEL-NEXT: v_readlane_b32 s38, v40, 4 ; GISEL-NEXT: v_readlane_b32 s37, v40, 3 @@ -306,62 +306,62 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s37, 3 ; GCN-NEXT: v_writelane_b32 v40, s38, 4 ; GCN-NEXT: v_writelane_b32 v40, s39, 5 -; GCN-NEXT: v_writelane_b32 v40, s40, 6 -; GCN-NEXT: v_writelane_b32 v40, s41, 7 -; GCN-NEXT: v_writelane_b32 v40, s42, 8 -; GCN-NEXT: v_writelane_b32 v40, s43, 9 -; GCN-NEXT: v_writelane_b32 v40, s44, 10 -; GCN-NEXT: v_writelane_b32 v40, s45, 11 -; GCN-NEXT: v_writelane_b32 v40, s46, 12 -; GCN-NEXT: v_writelane_b32 v40, s47, 13 -; GCN-NEXT: v_writelane_b32 v40, s48, 14 -; GCN-NEXT: v_writelane_b32 v40, s49, 15 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s64, 14 +; GCN-NEXT: v_writelane_b32 v40, s65, 15 ; GCN-NEXT: v_writelane_b32 v40, s30, 16 ; GCN-NEXT: v_writelane_b32 v40, s31, 17 -; GCN-NEXT: s_mov_b32 s42, s15 -; GCN-NEXT: s_mov_b32 s43, s14 -; GCN-NEXT: s_mov_b32 s44, s13 -; GCN-NEXT: s_mov_b32 s45, s12 +; GCN-NEXT: s_mov_b32 s50, s15 +; GCN-NEXT: s_mov_b32 s51, s14 +; GCN-NEXT: s_mov_b32 s52, s13 +; GCN-NEXT: s_mov_b32 s53, s12 ; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] -; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] -; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: s_mov_b64 
s[48:49], s[4:5] +; GCN-NEXT: s_mov_b64 s[54:55], exec ; GCN-NEXT: v_mov_b32_e32 v2, 0x7b ; GCN-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_and_saveexec_b64 s[64:65], vcc +; GCN-NEXT: s_mov_b64 s[4:5], s[48:49] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] ; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] -; GCN-NEXT: s_mov_b32 s12, s45 -; GCN-NEXT: s_mov_b32 s13, s44 -; GCN-NEXT: s_mov_b32 s14, s43 -; GCN-NEXT: s_mov_b32 s15, s42 +; GCN-NEXT: s_mov_b32 s12, s53 +; GCN-NEXT: s_mov_b32 s13, s52 +; GCN-NEXT: s_mov_b32 s14, s51 +; GCN-NEXT: s_mov_b32 s15, s50 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] +; GCN-NEXT: s_xor_b64 exec, exec, s[64:65] ; GCN-NEXT: s_cbranch_execnz .LBB3_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] +; GCN-NEXT: s_mov_b64 exec, s[54:55] ; GCN-NEXT: v_readlane_b32 s30, v40, 16 ; GCN-NEXT: v_readlane_b32 s31, v40, 17 -; GCN-NEXT: v_readlane_b32 s49, v40, 15 -; GCN-NEXT: v_readlane_b32 s48, v40, 14 -; GCN-NEXT: v_readlane_b32 s47, v40, 13 -; GCN-NEXT: v_readlane_b32 s46, v40, 12 -; GCN-NEXT: v_readlane_b32 s45, v40, 11 -; GCN-NEXT: v_readlane_b32 s44, v40, 10 -; GCN-NEXT: v_readlane_b32 s43, v40, 9 -; GCN-NEXT: v_readlane_b32 s42, v40, 8 -; GCN-NEXT: v_readlane_b32 s41, v40, 7 -; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s65, v40, 15 +; GCN-NEXT: v_readlane_b32 s64, v40, 14 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 
s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 ; GCN-NEXT: v_readlane_b32 s39, v40, 5 ; GCN-NEXT: v_readlane_b32 s38, v40, 4 ; GCN-NEXT: v_readlane_b32 s37, v40, 3 @@ -393,60 +393,60 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 3 ; GISEL-NEXT: v_writelane_b32 v40, s38, 4 ; GISEL-NEXT: v_writelane_b32 v40, s39, 5 -; GISEL-NEXT: v_writelane_b32 v40, s40, 6 -; GISEL-NEXT: v_writelane_b32 v40, s41, 7 -; GISEL-NEXT: v_writelane_b32 v40, s42, 8 -; GISEL-NEXT: v_writelane_b32 v40, s43, 9 -; GISEL-NEXT: v_writelane_b32 v40, s44, 10 -; GISEL-NEXT: v_writelane_b32 v40, s45, 11 -; GISEL-NEXT: v_writelane_b32 v40, s46, 12 -; GISEL-NEXT: v_writelane_b32 v40, s47, 13 -; GISEL-NEXT: v_writelane_b32 v40, s48, 14 -; GISEL-NEXT: v_writelane_b32 v40, s49, 15 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s64, 14 +; GISEL-NEXT: v_writelane_b32 v40, s65, 15 ; GISEL-NEXT: v_writelane_b32 v40, s30, 16 ; GISEL-NEXT: v_writelane_b32 v40, s31, 17 -; GISEL-NEXT: s_mov_b32 s42, s15 -; GISEL-NEXT: s_mov_b32 s43, s14 -; GISEL-NEXT: s_mov_b32 s44, s13 -; GISEL-NEXT: s_mov_b32 s45, s12 +; GISEL-NEXT: s_mov_b32 s50, s15 +; GISEL-NEXT: s_mov_b32 s51, s14 +; GISEL-NEXT: s_mov_b32 s52, s13 +; GISEL-NEXT: s_mov_b32 s53, s12 ; GISEL-NEXT: s_mov_b64 s[34:35], s[10:11] ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] -; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] -; GISEL-NEXT: s_mov_b64 s[46:47], exec +; GISEL-NEXT: s_mov_b64 s[48:49], s[4:5] +; GISEL-NEXT: s_mov_b64 s[54:55], exec ; GISEL-NEXT: .LBB3_1: ; 
=>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[64:65], vcc ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b -; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] +; GISEL-NEXT: s_mov_b64 s[4:5], s[48:49] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] ; GISEL-NEXT: s_mov_b64 s[10:11], s[34:35] -; GISEL-NEXT: s_mov_b32 s12, s45 -; GISEL-NEXT: s_mov_b32 s13, s44 -; GISEL-NEXT: s_mov_b32 s14, s43 -; GISEL-NEXT: s_mov_b32 s15, s42 +; GISEL-NEXT: s_mov_b32 s12, s53 +; GISEL-NEXT: s_mov_b32 s13, s52 +; GISEL-NEXT: s_mov_b32 s14, s51 +; GISEL-NEXT: s_mov_b32 s15, s50 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] +; GISEL-NEXT: s_xor_b64 exec, exec, s[64:65] ; GISEL-NEXT: s_cbranch_execnz .LBB3_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[46:47] +; GISEL-NEXT: s_mov_b64 exec, s[54:55] ; GISEL-NEXT: v_readlane_b32 s30, v40, 16 ; GISEL-NEXT: v_readlane_b32 s31, v40, 17 -; GISEL-NEXT: v_readlane_b32 s49, v40, 15 -; GISEL-NEXT: v_readlane_b32 s48, v40, 14 -; GISEL-NEXT: v_readlane_b32 s47, v40, 13 -; GISEL-NEXT: v_readlane_b32 s46, v40, 12 -; GISEL-NEXT: v_readlane_b32 s45, v40, 11 -; GISEL-NEXT: v_readlane_b32 s44, v40, 10 -; GISEL-NEXT: v_readlane_b32 s43, v40, 9 -; GISEL-NEXT: v_readlane_b32 s42, v40, 8 -; GISEL-NEXT: v_readlane_b32 s41, v40, 7 -; GISEL-NEXT: v_readlane_b32 s40, v40, 6 +; GISEL-NEXT: v_readlane_b32 s65, v40, 15 +; GISEL-NEXT: v_readlane_b32 s64, v40, 14 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; 
GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 ; GISEL-NEXT: v_readlane_b32 s39, v40, 5 ; GISEL-NEXT: v_readlane_b32 s38, v40, 4 ; GISEL-NEXT: v_readlane_b32 s37, v40, 3 @@ -482,61 +482,61 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s37, 3 ; GCN-NEXT: v_writelane_b32 v40, s38, 4 ; GCN-NEXT: v_writelane_b32 v40, s39, 5 -; GCN-NEXT: v_writelane_b32 v40, s40, 6 -; GCN-NEXT: v_writelane_b32 v40, s41, 7 -; GCN-NEXT: v_writelane_b32 v40, s42, 8 -; GCN-NEXT: v_writelane_b32 v40, s43, 9 -; GCN-NEXT: v_writelane_b32 v40, s44, 10 -; GCN-NEXT: v_writelane_b32 v40, s45, 11 -; GCN-NEXT: v_writelane_b32 v40, s46, 12 -; GCN-NEXT: v_writelane_b32 v40, s47, 13 -; GCN-NEXT: v_writelane_b32 v40, s48, 14 -; GCN-NEXT: v_writelane_b32 v40, s49, 15 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s64, 14 +; GCN-NEXT: v_writelane_b32 v40, s65, 15 ; GCN-NEXT: v_writelane_b32 v40, s30, 16 ; GCN-NEXT: v_writelane_b32 v40, s31, 17 -; GCN-NEXT: s_mov_b32 s42, s15 -; GCN-NEXT: s_mov_b32 s43, s14 -; GCN-NEXT: s_mov_b32 s44, s13 -; GCN-NEXT: s_mov_b32 s45, s12 +; GCN-NEXT: s_mov_b32 s50, s15 +; GCN-NEXT: s_mov_b32 s51, s14 +; GCN-NEXT: s_mov_b32 s52, s13 +; GCN-NEXT: s_mov_b32 s53, s12 ; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] -; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] -; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: s_mov_b64 s[48:49], s[4:5] +; GCN-NEXT: s_mov_b64 s[54:55], exec ; GCN-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: 
v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_and_saveexec_b64 s[64:65], vcc +; GCN-NEXT: s_mov_b64 s[4:5], s[48:49] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] ; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] -; GCN-NEXT: s_mov_b32 s12, s45 -; GCN-NEXT: s_mov_b32 s13, s44 -; GCN-NEXT: s_mov_b32 s14, s43 -; GCN-NEXT: s_mov_b32 s15, s42 +; GCN-NEXT: s_mov_b32 s12, s53 +; GCN-NEXT: s_mov_b32 s13, s52 +; GCN-NEXT: s_mov_b32 s14, s51 +; GCN-NEXT: s_mov_b32 s15, s50 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] +; GCN-NEXT: s_xor_b64 exec, exec, s[64:65] ; GCN-NEXT: s_cbranch_execnz .LBB4_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] +; GCN-NEXT: s_mov_b64 exec, s[54:55] ; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v2 ; GCN-NEXT: v_readlane_b32 s30, v40, 16 ; GCN-NEXT: v_readlane_b32 s31, v40, 17 -; GCN-NEXT: v_readlane_b32 s49, v40, 15 -; GCN-NEXT: v_readlane_b32 s48, v40, 14 -; GCN-NEXT: v_readlane_b32 s47, v40, 13 -; GCN-NEXT: v_readlane_b32 s46, v40, 12 -; GCN-NEXT: v_readlane_b32 s45, v40, 11 -; GCN-NEXT: v_readlane_b32 s44, v40, 10 -; GCN-NEXT: v_readlane_b32 s43, v40, 9 -; GCN-NEXT: v_readlane_b32 s42, v40, 8 -; GCN-NEXT: v_readlane_b32 s41, v40, 7 -; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s65, v40, 15 +; GCN-NEXT: v_readlane_b32 s64, v40, 14 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 ; GCN-NEXT: v_readlane_b32 s39, v40, 5 ; GCN-NEXT: v_readlane_b32 s38, v40, 4 ; GCN-NEXT: v_readlane_b32 
s37, v40, 3 @@ -568,61 +568,61 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 3 ; GISEL-NEXT: v_writelane_b32 v40, s38, 4 ; GISEL-NEXT: v_writelane_b32 v40, s39, 5 -; GISEL-NEXT: v_writelane_b32 v40, s40, 6 -; GISEL-NEXT: v_writelane_b32 v40, s41, 7 -; GISEL-NEXT: v_writelane_b32 v40, s42, 8 -; GISEL-NEXT: v_writelane_b32 v40, s43, 9 -; GISEL-NEXT: v_writelane_b32 v40, s44, 10 -; GISEL-NEXT: v_writelane_b32 v40, s45, 11 -; GISEL-NEXT: v_writelane_b32 v40, s46, 12 -; GISEL-NEXT: v_writelane_b32 v40, s47, 13 -; GISEL-NEXT: v_writelane_b32 v40, s48, 14 -; GISEL-NEXT: v_writelane_b32 v40, s49, 15 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s64, 14 +; GISEL-NEXT: v_writelane_b32 v40, s65, 15 ; GISEL-NEXT: v_writelane_b32 v40, s30, 16 ; GISEL-NEXT: v_writelane_b32 v40, s31, 17 -; GISEL-NEXT: s_mov_b32 s42, s15 -; GISEL-NEXT: s_mov_b32 s43, s14 -; GISEL-NEXT: s_mov_b32 s44, s13 -; GISEL-NEXT: s_mov_b32 s45, s12 +; GISEL-NEXT: s_mov_b32 s50, s15 +; GISEL-NEXT: s_mov_b32 s51, s14 +; GISEL-NEXT: s_mov_b32 s52, s13 +; GISEL-NEXT: s_mov_b32 s53, s12 ; GISEL-NEXT: s_mov_b64 s[34:35], s[10:11] ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] -; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] -; GISEL-NEXT: s_mov_b64 s[46:47], exec +; GISEL-NEXT: s_mov_b64 s[48:49], s[4:5] +; GISEL-NEXT: s_mov_b64 s[54:55], exec ; GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GISEL-NEXT: 
s_mov_b64 s[4:5], s[40:41] +; GISEL-NEXT: s_and_saveexec_b64 s[64:65], vcc +; GISEL-NEXT: s_mov_b64 s[4:5], s[48:49] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] ; GISEL-NEXT: s_mov_b64 s[10:11], s[34:35] -; GISEL-NEXT: s_mov_b32 s12, s45 -; GISEL-NEXT: s_mov_b32 s13, s44 -; GISEL-NEXT: s_mov_b32 s14, s43 -; GISEL-NEXT: s_mov_b32 s15, s42 +; GISEL-NEXT: s_mov_b32 s12, s53 +; GISEL-NEXT: s_mov_b32 s13, s52 +; GISEL-NEXT: s_mov_b32 s14, s51 +; GISEL-NEXT: s_mov_b32 s15, s50 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: v_mov_b32_e32 v1, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] +; GISEL-NEXT: s_xor_b64 exec, exec, s[64:65] ; GISEL-NEXT: s_cbranch_execnz .LBB4_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[46:47] +; GISEL-NEXT: s_mov_b64 exec, s[54:55] ; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 16 ; GISEL-NEXT: v_readlane_b32 s31, v40, 17 -; GISEL-NEXT: v_readlane_b32 s49, v40, 15 -; GISEL-NEXT: v_readlane_b32 s48, v40, 14 -; GISEL-NEXT: v_readlane_b32 s47, v40, 13 -; GISEL-NEXT: v_readlane_b32 s46, v40, 12 -; GISEL-NEXT: v_readlane_b32 s45, v40, 11 -; GISEL-NEXT: v_readlane_b32 s44, v40, 10 -; GISEL-NEXT: v_readlane_b32 s43, v40, 9 -; GISEL-NEXT: v_readlane_b32 s42, v40, 8 -; GISEL-NEXT: v_readlane_b32 s41, v40, 7 -; GISEL-NEXT: v_readlane_b32 s40, v40, 6 +; GISEL-NEXT: v_readlane_b32 s65, v40, 15 +; GISEL-NEXT: v_readlane_b32 s64, v40, 14 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 ; GISEL-NEXT: v_readlane_b32 s39, v40, 5 ; GISEL-NEXT: v_readlane_b32 s38, v40, 4 ; GISEL-NEXT: v_readlane_b32 s37, 
v40, 3 @@ -659,70 +659,70 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: v_writelane_b32 v40, s37, 3 ; GCN-NEXT: v_writelane_b32 v40, s38, 4 ; GCN-NEXT: v_writelane_b32 v40, s39, 5 -; GCN-NEXT: v_writelane_b32 v40, s40, 6 -; GCN-NEXT: v_writelane_b32 v40, s41, 7 -; GCN-NEXT: v_writelane_b32 v40, s42, 8 -; GCN-NEXT: v_writelane_b32 v40, s43, 9 -; GCN-NEXT: v_writelane_b32 v40, s44, 10 -; GCN-NEXT: v_writelane_b32 v40, s45, 11 -; GCN-NEXT: v_writelane_b32 v40, s46, 12 -; GCN-NEXT: v_writelane_b32 v40, s47, 13 -; GCN-NEXT: v_writelane_b32 v40, s48, 14 -; GCN-NEXT: v_writelane_b32 v40, s49, 15 -; GCN-NEXT: v_writelane_b32 v40, s50, 16 -; GCN-NEXT: v_writelane_b32 v40, s51, 17 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s64, 14 +; GCN-NEXT: v_writelane_b32 v40, s65, 15 +; GCN-NEXT: v_writelane_b32 v40, s66, 16 +; GCN-NEXT: v_writelane_b32 v40, s67, 17 ; GCN-NEXT: v_writelane_b32 v40, s30, 18 ; GCN-NEXT: v_writelane_b32 v40, s31, 19 -; GCN-NEXT: s_mov_b32 s42, s15 -; GCN-NEXT: s_mov_b32 s43, s14 -; GCN-NEXT: s_mov_b32 s44, s13 -; GCN-NEXT: s_mov_b32 s45, s12 +; GCN-NEXT: s_mov_b32 s50, s15 +; GCN-NEXT: s_mov_b32 s51, s14 +; GCN-NEXT: s_mov_b32 s52, s13 +; GCN-NEXT: s_mov_b32 s53, s12 ; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] -; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] +; GCN-NEXT: s_mov_b64 s[48:49], s[4:5] ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc +; GCN-NEXT: s_and_saveexec_b64 s[54:55], vcc ; GCN-NEXT: s_cbranch_execz .LBB5_4 ; GCN-NEXT: ; %bb.1: ; %bb1 -; 
GCN-NEXT: s_mov_b64 s[48:49], exec +; GCN-NEXT: s_mov_b64 s[64:65], exec ; GCN-NEXT: .LBB5_2: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[50:51], vcc -; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_and_saveexec_b64 s[66:67], vcc +; GCN-NEXT: s_mov_b64 s[4:5], s[48:49] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] ; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] -; GCN-NEXT: s_mov_b32 s12, s45 -; GCN-NEXT: s_mov_b32 s13, s44 -; GCN-NEXT: s_mov_b32 s14, s43 -; GCN-NEXT: s_mov_b32 s15, s42 +; GCN-NEXT: s_mov_b32 s12, s53 +; GCN-NEXT: s_mov_b32 s13, s52 +; GCN-NEXT: s_mov_b32 s14, s51 +; GCN-NEXT: s_mov_b32 s15, s50 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_xor_b64 exec, exec, s[50:51] +; GCN-NEXT: s_xor_b64 exec, exec, s[66:67] ; GCN-NEXT: s_cbranch_execnz .LBB5_2 ; GCN-NEXT: ; %bb.3: -; GCN-NEXT: s_mov_b64 exec, s[48:49] +; GCN-NEXT: s_mov_b64 exec, s[64:65] ; GCN-NEXT: .LBB5_4: ; %bb2 -; GCN-NEXT: s_or_b64 exec, exec, s[46:47] +; GCN-NEXT: s_or_b64 exec, exec, s[54:55] ; GCN-NEXT: v_readlane_b32 s30, v40, 18 ; GCN-NEXT: v_readlane_b32 s31, v40, 19 -; GCN-NEXT: v_readlane_b32 s51, v40, 17 -; GCN-NEXT: v_readlane_b32 s50, v40, 16 -; GCN-NEXT: v_readlane_b32 s49, v40, 15 -; GCN-NEXT: v_readlane_b32 s48, v40, 14 -; GCN-NEXT: v_readlane_b32 s47, v40, 13 -; GCN-NEXT: v_readlane_b32 s46, v40, 12 -; GCN-NEXT: v_readlane_b32 s45, v40, 11 -; GCN-NEXT: v_readlane_b32 s44, v40, 10 -; GCN-NEXT: v_readlane_b32 s43, v40, 9 -; GCN-NEXT: v_readlane_b32 s42, v40, 8 -; GCN-NEXT: v_readlane_b32 s41, v40, 7 -; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s67, v40, 17 +; GCN-NEXT: v_readlane_b32 s66, v40, 16 +; GCN-NEXT: v_readlane_b32 s65, v40, 15 +; GCN-NEXT: v_readlane_b32 s64, v40, 14 +; 
GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 ; GCN-NEXT: v_readlane_b32 s39, v40, 5 ; GCN-NEXT: v_readlane_b32 s38, v40, 4 ; GCN-NEXT: v_readlane_b32 s37, v40, 3 @@ -754,70 +754,70 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 3 ; GISEL-NEXT: v_writelane_b32 v40, s38, 4 ; GISEL-NEXT: v_writelane_b32 v40, s39, 5 -; GISEL-NEXT: v_writelane_b32 v40, s40, 6 -; GISEL-NEXT: v_writelane_b32 v40, s41, 7 -; GISEL-NEXT: v_writelane_b32 v40, s42, 8 -; GISEL-NEXT: v_writelane_b32 v40, s43, 9 -; GISEL-NEXT: v_writelane_b32 v40, s44, 10 -; GISEL-NEXT: v_writelane_b32 v40, s45, 11 -; GISEL-NEXT: v_writelane_b32 v40, s46, 12 -; GISEL-NEXT: v_writelane_b32 v40, s47, 13 -; GISEL-NEXT: v_writelane_b32 v40, s48, 14 -; GISEL-NEXT: v_writelane_b32 v40, s49, 15 -; GISEL-NEXT: v_writelane_b32 v40, s50, 16 -; GISEL-NEXT: v_writelane_b32 v40, s51, 17 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s64, 14 +; GISEL-NEXT: v_writelane_b32 v40, s65, 15 +; GISEL-NEXT: v_writelane_b32 v40, s66, 16 +; GISEL-NEXT: v_writelane_b32 v40, s67, 17 ; GISEL-NEXT: v_writelane_b32 v40, s30, 18 ; GISEL-NEXT: v_writelane_b32 v40, s31, 19 -; GISEL-NEXT: s_mov_b32 s42, s15 -; GISEL-NEXT: s_mov_b32 s43, s14 -; GISEL-NEXT: s_mov_b32 s44, s13 -; GISEL-NEXT: s_mov_b32 s45, s12 +; GISEL-NEXT: s_mov_b32 s50, s15 +; GISEL-NEXT: s_mov_b32 s51, s14 
+; GISEL-NEXT: s_mov_b32 s52, s13 +; GISEL-NEXT: s_mov_b32 s53, s12 ; GISEL-NEXT: s_mov_b64 s[34:35], s[10:11] ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] -; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] +; GISEL-NEXT: s_mov_b64 s[48:49], s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, 1, v2 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: s_and_saveexec_b64 s[46:47], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[54:55], vcc ; GISEL-NEXT: s_cbranch_execz .LBB5_4 ; GISEL-NEXT: ; %bb.1: ; %bb1 -; GISEL-NEXT: s_mov_b64 s[48:49], exec +; GISEL-NEXT: s_mov_b64 s[64:65], exec ; GISEL-NEXT: .LBB5_2: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[50:51], vcc -; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] +; GISEL-NEXT: s_and_saveexec_b64 s[66:67], vcc +; GISEL-NEXT: s_mov_b64 s[4:5], s[48:49] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] ; GISEL-NEXT: s_mov_b64 s[10:11], s[34:35] -; GISEL-NEXT: s_mov_b32 s12, s45 -; GISEL-NEXT: s_mov_b32 s13, s44 -; GISEL-NEXT: s_mov_b32 s14, s43 -; GISEL-NEXT: s_mov_b32 s15, s42 +; GISEL-NEXT: s_mov_b32 s12, s53 +; GISEL-NEXT: s_mov_b32 s13, s52 +; GISEL-NEXT: s_mov_b32 s14, s51 +; GISEL-NEXT: s_mov_b32 s15, s50 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[50:51] +; GISEL-NEXT: s_xor_b64 exec, exec, s[66:67] ; GISEL-NEXT: s_cbranch_execnz .LBB5_2 ; GISEL-NEXT: ; %bb.3: -; GISEL-NEXT: s_mov_b64 exec, s[48:49] +; GISEL-NEXT: s_mov_b64 exec, s[64:65] ; GISEL-NEXT: .LBB5_4: ; %bb2 -; GISEL-NEXT: s_or_b64 exec, exec, s[46:47] +; GISEL-NEXT: s_or_b64 exec, exec, s[54:55] ; GISEL-NEXT: v_readlane_b32 s30, v40, 18 ; GISEL-NEXT: v_readlane_b32 s31, v40, 19 -; GISEL-NEXT: v_readlane_b32 s51, v40, 17 -; GISEL-NEXT: 
v_readlane_b32 s50, v40, 16 -; GISEL-NEXT: v_readlane_b32 s49, v40, 15 -; GISEL-NEXT: v_readlane_b32 s48, v40, 14 -; GISEL-NEXT: v_readlane_b32 s47, v40, 13 -; GISEL-NEXT: v_readlane_b32 s46, v40, 12 -; GISEL-NEXT: v_readlane_b32 s45, v40, 11 -; GISEL-NEXT: v_readlane_b32 s44, v40, 10 -; GISEL-NEXT: v_readlane_b32 s43, v40, 9 -; GISEL-NEXT: v_readlane_b32 s42, v40, 8 -; GISEL-NEXT: v_readlane_b32 s41, v40, 7 -; GISEL-NEXT: v_readlane_b32 s40, v40, 6 +; GISEL-NEXT: v_readlane_b32 s67, v40, 17 +; GISEL-NEXT: v_readlane_b32 s66, v40, 16 +; GISEL-NEXT: v_readlane_b32 s65, v40, 15 +; GISEL-NEXT: v_readlane_b32 s64, v40, 14 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 ; GISEL-NEXT: v_readlane_b32 s39, v40, 5 ; GISEL-NEXT: v_readlane_b32 s38, v40, 4 ; GISEL-NEXT: v_readlane_b32 s37, v40, 3 @@ -859,32 +859,16 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s37, 3 ; GCN-NEXT: v_writelane_b32 v40, s38, 4 ; GCN-NEXT: v_writelane_b32 v40, s39, 5 -; GCN-NEXT: v_writelane_b32 v40, s40, 6 -; GCN-NEXT: v_writelane_b32 v40, s41, 7 -; GCN-NEXT: v_writelane_b32 v40, s42, 8 -; GCN-NEXT: v_writelane_b32 v40, s43, 9 -; GCN-NEXT: v_writelane_b32 v40, s44, 10 -; GCN-NEXT: v_writelane_b32 v40, s45, 11 -; GCN-NEXT: v_writelane_b32 v40, s46, 12 -; GCN-NEXT: v_writelane_b32 v40, s47, 13 -; GCN-NEXT: v_writelane_b32 v40, s48, 14 -; GCN-NEXT: v_writelane_b32 v40, s49, 15 -; GCN-NEXT: v_writelane_b32 v40, s50, 16 -; GCN-NEXT: v_writelane_b32 v40, s51, 17 -; GCN-NEXT: v_writelane_b32 v40, s52, 18 -; GCN-NEXT: v_writelane_b32 v40, s53, 19 -; GCN-NEXT: v_writelane_b32 v40, s54, 20 -; GCN-NEXT: v_writelane_b32 v40, s55, 21 -; GCN-NEXT: v_writelane_b32 v40, 
s56, 22 -; GCN-NEXT: v_writelane_b32 v40, s57, 23 -; GCN-NEXT: v_writelane_b32 v40, s58, 24 -; GCN-NEXT: v_writelane_b32 v40, s59, 25 -; GCN-NEXT: v_writelane_b32 v40, s60, 26 -; GCN-NEXT: v_writelane_b32 v40, s61, 27 -; GCN-NEXT: v_writelane_b32 v40, s62, 28 -; GCN-NEXT: v_writelane_b32 v40, s63, 29 -; GCN-NEXT: v_writelane_b32 v40, s30, 30 -; GCN-NEXT: v_writelane_b32 v40, s31, 31 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s30, 14 +; GCN-NEXT: v_writelane_b32 v40, s31, 15 ; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 @@ -898,32 +882,16 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: s_cbranch_execnz .LBB6_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_readlane_b32 s30, v40, 30 -; GCN-NEXT: v_readlane_b32 s31, v40, 31 -; GCN-NEXT: v_readlane_b32 s63, v40, 29 -; GCN-NEXT: v_readlane_b32 s62, v40, 28 -; GCN-NEXT: v_readlane_b32 s61, v40, 27 -; GCN-NEXT: v_readlane_b32 s60, v40, 26 -; GCN-NEXT: v_readlane_b32 s59, v40, 25 -; GCN-NEXT: v_readlane_b32 s58, v40, 24 -; GCN-NEXT: v_readlane_b32 s57, v40, 23 -; GCN-NEXT: v_readlane_b32 s56, v40, 22 -; GCN-NEXT: v_readlane_b32 s55, v40, 21 -; GCN-NEXT: v_readlane_b32 s54, v40, 20 -; GCN-NEXT: v_readlane_b32 s53, v40, 19 -; GCN-NEXT: v_readlane_b32 s52, v40, 18 -; GCN-NEXT: v_readlane_b32 s51, v40, 17 -; GCN-NEXT: v_readlane_b32 s50, v40, 16 -; GCN-NEXT: v_readlane_b32 s49, v40, 15 -; GCN-NEXT: v_readlane_b32 s48, v40, 14 -; GCN-NEXT: v_readlane_b32 s47, v40, 13 -; GCN-NEXT: v_readlane_b32 s46, v40, 12 -; GCN-NEXT: v_readlane_b32 s45, v40, 11 -; GCN-NEXT: 
v_readlane_b32 s44, v40, 10 -; GCN-NEXT: v_readlane_b32 s43, v40, 9 -; GCN-NEXT: v_readlane_b32 s42, v40, 8 -; GCN-NEXT: v_readlane_b32 s41, v40, 7 -; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s30, v40, 14 +; GCN-NEXT: v_readlane_b32 s31, v40, 15 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 ; GCN-NEXT: v_readlane_b32 s39, v40, 5 ; GCN-NEXT: v_readlane_b32 s38, v40, 4 ; GCN-NEXT: v_readlane_b32 s37, v40, 3 @@ -953,32 +921,16 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 3 ; GISEL-NEXT: v_writelane_b32 v40, s38, 4 ; GISEL-NEXT: v_writelane_b32 v40, s39, 5 -; GISEL-NEXT: v_writelane_b32 v40, s40, 6 -; GISEL-NEXT: v_writelane_b32 v40, s41, 7 -; GISEL-NEXT: v_writelane_b32 v40, s42, 8 -; GISEL-NEXT: v_writelane_b32 v40, s43, 9 -; GISEL-NEXT: v_writelane_b32 v40, s44, 10 -; GISEL-NEXT: v_writelane_b32 v40, s45, 11 -; GISEL-NEXT: v_writelane_b32 v40, s46, 12 -; GISEL-NEXT: v_writelane_b32 v40, s47, 13 -; GISEL-NEXT: v_writelane_b32 v40, s48, 14 -; GISEL-NEXT: v_writelane_b32 v40, s49, 15 -; GISEL-NEXT: v_writelane_b32 v40, s50, 16 -; GISEL-NEXT: v_writelane_b32 v40, s51, 17 -; GISEL-NEXT: v_writelane_b32 v40, s52, 18 -; GISEL-NEXT: v_writelane_b32 v40, s53, 19 -; GISEL-NEXT: v_writelane_b32 v40, s54, 20 -; GISEL-NEXT: v_writelane_b32 v40, s55, 21 -; GISEL-NEXT: v_writelane_b32 v40, s56, 22 -; GISEL-NEXT: v_writelane_b32 v40, s57, 23 -; GISEL-NEXT: v_writelane_b32 v40, s58, 24 -; GISEL-NEXT: v_writelane_b32 v40, s59, 25 -; GISEL-NEXT: v_writelane_b32 v40, s60, 26 -; GISEL-NEXT: v_writelane_b32 v40, s61, 27 -; GISEL-NEXT: v_writelane_b32 v40, s62, 28 -; GISEL-NEXT: v_writelane_b32 v40, s63, 29 -; GISEL-NEXT: 
v_writelane_b32 v40, s30, 30 -; GISEL-NEXT: v_writelane_b32 v40, s31, 31 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s30, 14 +; GISEL-NEXT: v_writelane_b32 v40, s31, 15 ; GISEL-NEXT: s_mov_b64 s[6:7], exec ; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s8, v0 @@ -992,32 +944,16 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: s_cbranch_execnz .LBB6_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: v_readlane_b32 s30, v40, 30 -; GISEL-NEXT: v_readlane_b32 s31, v40, 31 -; GISEL-NEXT: v_readlane_b32 s63, v40, 29 -; GISEL-NEXT: v_readlane_b32 s62, v40, 28 -; GISEL-NEXT: v_readlane_b32 s61, v40, 27 -; GISEL-NEXT: v_readlane_b32 s60, v40, 26 -; GISEL-NEXT: v_readlane_b32 s59, v40, 25 -; GISEL-NEXT: v_readlane_b32 s58, v40, 24 -; GISEL-NEXT: v_readlane_b32 s57, v40, 23 -; GISEL-NEXT: v_readlane_b32 s56, v40, 22 -; GISEL-NEXT: v_readlane_b32 s55, v40, 21 -; GISEL-NEXT: v_readlane_b32 s54, v40, 20 -; GISEL-NEXT: v_readlane_b32 s53, v40, 19 -; GISEL-NEXT: v_readlane_b32 s52, v40, 18 -; GISEL-NEXT: v_readlane_b32 s51, v40, 17 -; GISEL-NEXT: v_readlane_b32 s50, v40, 16 -; GISEL-NEXT: v_readlane_b32 s49, v40, 15 -; GISEL-NEXT: v_readlane_b32 s48, v40, 14 -; GISEL-NEXT: v_readlane_b32 s47, v40, 13 -; GISEL-NEXT: v_readlane_b32 s46, v40, 12 -; GISEL-NEXT: v_readlane_b32 s45, v40, 11 -; GISEL-NEXT: v_readlane_b32 s44, v40, 10 -; GISEL-NEXT: v_readlane_b32 s43, v40, 9 -; GISEL-NEXT: v_readlane_b32 s42, v40, 8 -; GISEL-NEXT: v_readlane_b32 s41, v40, 7 -; GISEL-NEXT: v_readlane_b32 s40, v40, 6 +; GISEL-NEXT: v_readlane_b32 s30, v40, 14 +; 
GISEL-NEXT: v_readlane_b32 s31, v40, 15 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 ; GISEL-NEXT: v_readlane_b32 s39, v40, 5 ; GISEL-NEXT: v_readlane_b32 s38, v40, 4 ; GISEL-NEXT: v_readlane_b32 s37, v40, 3 @@ -1052,32 +988,16 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v41, s37, 3 ; GCN-NEXT: v_writelane_b32 v41, s38, 4 ; GCN-NEXT: v_writelane_b32 v41, s39, 5 -; GCN-NEXT: v_writelane_b32 v41, s40, 6 -; GCN-NEXT: v_writelane_b32 v41, s41, 7 -; GCN-NEXT: v_writelane_b32 v41, s42, 8 -; GCN-NEXT: v_writelane_b32 v41, s43, 9 -; GCN-NEXT: v_writelane_b32 v41, s44, 10 -; GCN-NEXT: v_writelane_b32 v41, s45, 11 -; GCN-NEXT: v_writelane_b32 v41, s46, 12 -; GCN-NEXT: v_writelane_b32 v41, s47, 13 -; GCN-NEXT: v_writelane_b32 v41, s48, 14 -; GCN-NEXT: v_writelane_b32 v41, s49, 15 -; GCN-NEXT: v_writelane_b32 v41, s50, 16 -; GCN-NEXT: v_writelane_b32 v41, s51, 17 -; GCN-NEXT: v_writelane_b32 v41, s52, 18 -; GCN-NEXT: v_writelane_b32 v41, s53, 19 -; GCN-NEXT: v_writelane_b32 v41, s54, 20 -; GCN-NEXT: v_writelane_b32 v41, s55, 21 -; GCN-NEXT: v_writelane_b32 v41, s56, 22 -; GCN-NEXT: v_writelane_b32 v41, s57, 23 -; GCN-NEXT: v_writelane_b32 v41, s58, 24 -; GCN-NEXT: v_writelane_b32 v41, s59, 25 -; GCN-NEXT: v_writelane_b32 v41, s60, 26 -; GCN-NEXT: v_writelane_b32 v41, s61, 27 -; GCN-NEXT: v_writelane_b32 v41, s62, 28 -; GCN-NEXT: v_writelane_b32 v41, s63, 29 -; GCN-NEXT: v_writelane_b32 v41, s30, 30 -; GCN-NEXT: v_writelane_b32 v41, s31, 31 +; GCN-NEXT: v_writelane_b32 v41, s48, 6 +; GCN-NEXT: v_writelane_b32 v41, s49, 7 +; GCN-NEXT: v_writelane_b32 v41, s50, 8 +; GCN-NEXT: v_writelane_b32 v41, s51, 9 +; GCN-NEXT: 
v_writelane_b32 v41, s52, 10 +; GCN-NEXT: v_writelane_b32 v41, s53, 11 +; GCN-NEXT: v_writelane_b32 v41, s54, 12 +; GCN-NEXT: v_writelane_b32 v41, s55, 13 +; GCN-NEXT: v_writelane_b32 v41, s30, 14 +; GCN-NEXT: v_writelane_b32 v41, s31, 15 ; GCN-NEXT: v_mov_b32_e32 v40, v0 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 @@ -1093,32 +1013,16 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v40 -; GCN-NEXT: v_readlane_b32 s30, v41, 30 -; GCN-NEXT: v_readlane_b32 s31, v41, 31 -; GCN-NEXT: v_readlane_b32 s63, v41, 29 -; GCN-NEXT: v_readlane_b32 s62, v41, 28 -; GCN-NEXT: v_readlane_b32 s61, v41, 27 -; GCN-NEXT: v_readlane_b32 s60, v41, 26 -; GCN-NEXT: v_readlane_b32 s59, v41, 25 -; GCN-NEXT: v_readlane_b32 s58, v41, 24 -; GCN-NEXT: v_readlane_b32 s57, v41, 23 -; GCN-NEXT: v_readlane_b32 s56, v41, 22 -; GCN-NEXT: v_readlane_b32 s55, v41, 21 -; GCN-NEXT: v_readlane_b32 s54, v41, 20 -; GCN-NEXT: v_readlane_b32 s53, v41, 19 -; GCN-NEXT: v_readlane_b32 s52, v41, 18 -; GCN-NEXT: v_readlane_b32 s51, v41, 17 -; GCN-NEXT: v_readlane_b32 s50, v41, 16 -; GCN-NEXT: v_readlane_b32 s49, v41, 15 -; GCN-NEXT: v_readlane_b32 s48, v41, 14 -; GCN-NEXT: v_readlane_b32 s47, v41, 13 -; GCN-NEXT: v_readlane_b32 s46, v41, 12 -; GCN-NEXT: v_readlane_b32 s45, v41, 11 -; GCN-NEXT: v_readlane_b32 s44, v41, 10 -; GCN-NEXT: v_readlane_b32 s43, v41, 9 -; GCN-NEXT: v_readlane_b32 s42, v41, 8 -; GCN-NEXT: v_readlane_b32 s41, v41, 7 -; GCN-NEXT: v_readlane_b32 s40, v41, 6 +; GCN-NEXT: v_readlane_b32 s30, v41, 14 +; GCN-NEXT: v_readlane_b32 s31, v41, 15 +; GCN-NEXT: v_readlane_b32 s55, v41, 13 +; GCN-NEXT: v_readlane_b32 s54, v41, 12 +; GCN-NEXT: v_readlane_b32 s53, v41, 11 +; GCN-NEXT: v_readlane_b32 s52, v41, 10 +; GCN-NEXT: v_readlane_b32 s51, v41, 9 +; GCN-NEXT: v_readlane_b32 s50, v41, 8 +; GCN-NEXT: v_readlane_b32 s49, v41, 7 +; GCN-NEXT: 
v_readlane_b32 s48, v41, 6 ; GCN-NEXT: v_readlane_b32 s39, v41, 5 ; GCN-NEXT: v_readlane_b32 s38, v41, 4 ; GCN-NEXT: v_readlane_b32 s37, v41, 3 @@ -1150,32 +1054,16 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v41, s37, 3 ; GISEL-NEXT: v_writelane_b32 v41, s38, 4 ; GISEL-NEXT: v_writelane_b32 v41, s39, 5 -; GISEL-NEXT: v_writelane_b32 v41, s40, 6 -; GISEL-NEXT: v_writelane_b32 v41, s41, 7 -; GISEL-NEXT: v_writelane_b32 v41, s42, 8 -; GISEL-NEXT: v_writelane_b32 v41, s43, 9 -; GISEL-NEXT: v_writelane_b32 v41, s44, 10 -; GISEL-NEXT: v_writelane_b32 v41, s45, 11 -; GISEL-NEXT: v_writelane_b32 v41, s46, 12 -; GISEL-NEXT: v_writelane_b32 v41, s47, 13 -; GISEL-NEXT: v_writelane_b32 v41, s48, 14 -; GISEL-NEXT: v_writelane_b32 v41, s49, 15 -; GISEL-NEXT: v_writelane_b32 v41, s50, 16 -; GISEL-NEXT: v_writelane_b32 v41, s51, 17 -; GISEL-NEXT: v_writelane_b32 v41, s52, 18 -; GISEL-NEXT: v_writelane_b32 v41, s53, 19 -; GISEL-NEXT: v_writelane_b32 v41, s54, 20 -; GISEL-NEXT: v_writelane_b32 v41, s55, 21 -; GISEL-NEXT: v_writelane_b32 v41, s56, 22 -; GISEL-NEXT: v_writelane_b32 v41, s57, 23 -; GISEL-NEXT: v_writelane_b32 v41, s58, 24 -; GISEL-NEXT: v_writelane_b32 v41, s59, 25 -; GISEL-NEXT: v_writelane_b32 v41, s60, 26 -; GISEL-NEXT: v_writelane_b32 v41, s61, 27 -; GISEL-NEXT: v_writelane_b32 v41, s62, 28 -; GISEL-NEXT: v_writelane_b32 v41, s63, 29 -; GISEL-NEXT: v_writelane_b32 v41, s30, 30 -; GISEL-NEXT: v_writelane_b32 v41, s31, 31 +; GISEL-NEXT: v_writelane_b32 v41, s48, 6 +; GISEL-NEXT: v_writelane_b32 v41, s49, 7 +; GISEL-NEXT: v_writelane_b32 v41, s50, 8 +; GISEL-NEXT: v_writelane_b32 v41, s51, 9 +; GISEL-NEXT: v_writelane_b32 v41, s52, 10 +; GISEL-NEXT: v_writelane_b32 v41, s53, 11 +; GISEL-NEXT: v_writelane_b32 v41, s54, 12 +; GISEL-NEXT: v_writelane_b32 v41, s55, 13 +; GISEL-NEXT: v_writelane_b32 v41, s30, 14 +; GISEL-NEXT: v_writelane_b32 v41, s31, 15 ; GISEL-NEXT: v_mov_b32_e32 v40, v0 ; GISEL-NEXT: 
s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 @@ -1191,32 +1079,16 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v40 -; GISEL-NEXT: v_readlane_b32 s30, v41, 30 -; GISEL-NEXT: v_readlane_b32 s31, v41, 31 -; GISEL-NEXT: v_readlane_b32 s63, v41, 29 -; GISEL-NEXT: v_readlane_b32 s62, v41, 28 -; GISEL-NEXT: v_readlane_b32 s61, v41, 27 -; GISEL-NEXT: v_readlane_b32 s60, v41, 26 -; GISEL-NEXT: v_readlane_b32 s59, v41, 25 -; GISEL-NEXT: v_readlane_b32 s58, v41, 24 -; GISEL-NEXT: v_readlane_b32 s57, v41, 23 -; GISEL-NEXT: v_readlane_b32 s56, v41, 22 -; GISEL-NEXT: v_readlane_b32 s55, v41, 21 -; GISEL-NEXT: v_readlane_b32 s54, v41, 20 -; GISEL-NEXT: v_readlane_b32 s53, v41, 19 -; GISEL-NEXT: v_readlane_b32 s52, v41, 18 -; GISEL-NEXT: v_readlane_b32 s51, v41, 17 -; GISEL-NEXT: v_readlane_b32 s50, v41, 16 -; GISEL-NEXT: v_readlane_b32 s49, v41, 15 -; GISEL-NEXT: v_readlane_b32 s48, v41, 14 -; GISEL-NEXT: v_readlane_b32 s47, v41, 13 -; GISEL-NEXT: v_readlane_b32 s46, v41, 12 -; GISEL-NEXT: v_readlane_b32 s45, v41, 11 -; GISEL-NEXT: v_readlane_b32 s44, v41, 10 -; GISEL-NEXT: v_readlane_b32 s43, v41, 9 -; GISEL-NEXT: v_readlane_b32 s42, v41, 8 -; GISEL-NEXT: v_readlane_b32 s41, v41, 7 -; GISEL-NEXT: v_readlane_b32 s40, v41, 6 +; GISEL-NEXT: v_readlane_b32 s30, v41, 14 +; GISEL-NEXT: v_readlane_b32 s31, v41, 15 +; GISEL-NEXT: v_readlane_b32 s55, v41, 13 +; GISEL-NEXT: v_readlane_b32 s54, v41, 12 +; GISEL-NEXT: v_readlane_b32 s53, v41, 11 +; GISEL-NEXT: v_readlane_b32 s52, v41, 10 +; GISEL-NEXT: v_readlane_b32 s51, v41, 9 +; GISEL-NEXT: v_readlane_b32 s50, v41, 8 +; GISEL-NEXT: v_readlane_b32 s49, v41, 7 +; GISEL-NEXT: v_readlane_b32 s48, v41, 6 ; GISEL-NEXT: v_readlane_b32 s39, v41, 5 ; GISEL-NEXT: v_readlane_b32 s38, v41, 4 ; GISEL-NEXT: v_readlane_b32 s37, v41, 3 @@ -1255,32 +1127,16 @@ define i32 
@test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s37, 3 ; GCN-NEXT: v_writelane_b32 v40, s38, 4 ; GCN-NEXT: v_writelane_b32 v40, s39, 5 -; GCN-NEXT: v_writelane_b32 v40, s40, 6 -; GCN-NEXT: v_writelane_b32 v40, s41, 7 -; GCN-NEXT: v_writelane_b32 v40, s42, 8 -; GCN-NEXT: v_writelane_b32 v40, s43, 9 -; GCN-NEXT: v_writelane_b32 v40, s44, 10 -; GCN-NEXT: v_writelane_b32 v40, s45, 11 -; GCN-NEXT: v_writelane_b32 v40, s46, 12 -; GCN-NEXT: v_writelane_b32 v40, s47, 13 -; GCN-NEXT: v_writelane_b32 v40, s48, 14 -; GCN-NEXT: v_writelane_b32 v40, s49, 15 -; GCN-NEXT: v_writelane_b32 v40, s50, 16 -; GCN-NEXT: v_writelane_b32 v40, s51, 17 -; GCN-NEXT: v_writelane_b32 v40, s52, 18 -; GCN-NEXT: v_writelane_b32 v40, s53, 19 -; GCN-NEXT: v_writelane_b32 v40, s54, 20 -; GCN-NEXT: v_writelane_b32 v40, s55, 21 -; GCN-NEXT: v_writelane_b32 v40, s56, 22 -; GCN-NEXT: v_writelane_b32 v40, s57, 23 -; GCN-NEXT: v_writelane_b32 v40, s58, 24 -; GCN-NEXT: v_writelane_b32 v40, s59, 25 -; GCN-NEXT: v_writelane_b32 v40, s60, 26 -; GCN-NEXT: v_writelane_b32 v40, s61, 27 -; GCN-NEXT: v_writelane_b32 v40, s62, 28 -; GCN-NEXT: v_writelane_b32 v40, s63, 29 -; GCN-NEXT: v_writelane_b32 v40, s30, 30 -; GCN-NEXT: v_writelane_b32 v40, s31, 31 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s30, 14 +; GCN-NEXT: v_writelane_b32 v40, s31, 15 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s8, v1 @@ -1296,32 +1152,16 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: 
v_mov_b32_e32 v0, v3 -; GCN-NEXT: v_readlane_b32 s30, v40, 30 -; GCN-NEXT: v_readlane_b32 s31, v40, 31 -; GCN-NEXT: v_readlane_b32 s63, v40, 29 -; GCN-NEXT: v_readlane_b32 s62, v40, 28 -; GCN-NEXT: v_readlane_b32 s61, v40, 27 -; GCN-NEXT: v_readlane_b32 s60, v40, 26 -; GCN-NEXT: v_readlane_b32 s59, v40, 25 -; GCN-NEXT: v_readlane_b32 s58, v40, 24 -; GCN-NEXT: v_readlane_b32 s57, v40, 23 -; GCN-NEXT: v_readlane_b32 s56, v40, 22 -; GCN-NEXT: v_readlane_b32 s55, v40, 21 -; GCN-NEXT: v_readlane_b32 s54, v40, 20 -; GCN-NEXT: v_readlane_b32 s53, v40, 19 -; GCN-NEXT: v_readlane_b32 s52, v40, 18 -; GCN-NEXT: v_readlane_b32 s51, v40, 17 -; GCN-NEXT: v_readlane_b32 s50, v40, 16 -; GCN-NEXT: v_readlane_b32 s49, v40, 15 -; GCN-NEXT: v_readlane_b32 s48, v40, 14 -; GCN-NEXT: v_readlane_b32 s47, v40, 13 -; GCN-NEXT: v_readlane_b32 s46, v40, 12 -; GCN-NEXT: v_readlane_b32 s45, v40, 11 -; GCN-NEXT: v_readlane_b32 s44, v40, 10 -; GCN-NEXT: v_readlane_b32 s43, v40, 9 -; GCN-NEXT: v_readlane_b32 s42, v40, 8 -; GCN-NEXT: v_readlane_b32 s41, v40, 7 -; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s30, v40, 14 +; GCN-NEXT: v_readlane_b32 s31, v40, 15 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 ; GCN-NEXT: v_readlane_b32 s39, v40, 5 ; GCN-NEXT: v_readlane_b32 s38, v40, 4 ; GCN-NEXT: v_readlane_b32 s37, v40, 3 @@ -1351,32 +1191,16 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 3 ; GISEL-NEXT: v_writelane_b32 v40, s38, 4 ; GISEL-NEXT: v_writelane_b32 v40, s39, 5 -; GISEL-NEXT: v_writelane_b32 v40, s40, 6 -; GISEL-NEXT: v_writelane_b32 v40, s41, 7 -; GISEL-NEXT: v_writelane_b32 v40, s42, 8 -; GISEL-NEXT: v_writelane_b32 v40, 
s43, 9 -; GISEL-NEXT: v_writelane_b32 v40, s44, 10 -; GISEL-NEXT: v_writelane_b32 v40, s45, 11 -; GISEL-NEXT: v_writelane_b32 v40, s46, 12 -; GISEL-NEXT: v_writelane_b32 v40, s47, 13 -; GISEL-NEXT: v_writelane_b32 v40, s48, 14 -; GISEL-NEXT: v_writelane_b32 v40, s49, 15 -; GISEL-NEXT: v_writelane_b32 v40, s50, 16 -; GISEL-NEXT: v_writelane_b32 v40, s51, 17 -; GISEL-NEXT: v_writelane_b32 v40, s52, 18 -; GISEL-NEXT: v_writelane_b32 v40, s53, 19 -; GISEL-NEXT: v_writelane_b32 v40, s54, 20 -; GISEL-NEXT: v_writelane_b32 v40, s55, 21 -; GISEL-NEXT: v_writelane_b32 v40, s56, 22 -; GISEL-NEXT: v_writelane_b32 v40, s57, 23 -; GISEL-NEXT: v_writelane_b32 v40, s58, 24 -; GISEL-NEXT: v_writelane_b32 v40, s59, 25 -; GISEL-NEXT: v_writelane_b32 v40, s60, 26 -; GISEL-NEXT: v_writelane_b32 v40, s61, 27 -; GISEL-NEXT: v_writelane_b32 v40, s62, 28 -; GISEL-NEXT: v_writelane_b32 v40, s63, 29 -; GISEL-NEXT: v_writelane_b32 v40, s30, 30 -; GISEL-NEXT: v_writelane_b32 v40, s31, 31 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s30, 14 +; GISEL-NEXT: v_writelane_b32 v40, s31, 15 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s8, v1 @@ -1392,32 +1216,16 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v2 -; GISEL-NEXT: v_readlane_b32 s30, v40, 30 -; GISEL-NEXT: v_readlane_b32 s31, v40, 31 -; GISEL-NEXT: v_readlane_b32 s63, v40, 29 -; GISEL-NEXT: v_readlane_b32 s62, v40, 28 -; GISEL-NEXT: v_readlane_b32 s61, v40, 27 -; GISEL-NEXT: v_readlane_b32 s60, v40, 
26 -; GISEL-NEXT: v_readlane_b32 s59, v40, 25 -; GISEL-NEXT: v_readlane_b32 s58, v40, 24 -; GISEL-NEXT: v_readlane_b32 s57, v40, 23 -; GISEL-NEXT: v_readlane_b32 s56, v40, 22 -; GISEL-NEXT: v_readlane_b32 s55, v40, 21 -; GISEL-NEXT: v_readlane_b32 s54, v40, 20 -; GISEL-NEXT: v_readlane_b32 s53, v40, 19 -; GISEL-NEXT: v_readlane_b32 s52, v40, 18 -; GISEL-NEXT: v_readlane_b32 s51, v40, 17 -; GISEL-NEXT: v_readlane_b32 s50, v40, 16 -; GISEL-NEXT: v_readlane_b32 s49, v40, 15 -; GISEL-NEXT: v_readlane_b32 s48, v40, 14 -; GISEL-NEXT: v_readlane_b32 s47, v40, 13 -; GISEL-NEXT: v_readlane_b32 s46, v40, 12 -; GISEL-NEXT: v_readlane_b32 s45, v40, 11 -; GISEL-NEXT: v_readlane_b32 s44, v40, 10 -; GISEL-NEXT: v_readlane_b32 s43, v40, 9 -; GISEL-NEXT: v_readlane_b32 s42, v40, 8 -; GISEL-NEXT: v_readlane_b32 s41, v40, 7 -; GISEL-NEXT: v_readlane_b32 s40, v40, 6 +; GISEL-NEXT: v_readlane_b32 s30, v40, 14 +; GISEL-NEXT: v_readlane_b32 s31, v40, 15 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 ; GISEL-NEXT: v_readlane_b32 s39, v40, 5 ; GISEL-NEXT: v_readlane_b32 s38, v40, 4 ; GISEL-NEXT: v_readlane_b32 s37, v40, 3 @@ -1452,32 +1260,16 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s37, 3 ; GCN-NEXT: v_writelane_b32 v40, s38, 4 ; GCN-NEXT: v_writelane_b32 v40, s39, 5 -; GCN-NEXT: v_writelane_b32 v40, s40, 6 -; GCN-NEXT: v_writelane_b32 v40, s41, 7 -; GCN-NEXT: v_writelane_b32 v40, s42, 8 -; GCN-NEXT: v_writelane_b32 v40, s43, 9 -; GCN-NEXT: v_writelane_b32 v40, s44, 10 -; GCN-NEXT: v_writelane_b32 v40, s45, 11 -; GCN-NEXT: v_writelane_b32 v40, s46, 12 -; GCN-NEXT: v_writelane_b32 v40, s47, 13 -; GCN-NEXT: v_writelane_b32 v40, s48, 14 -; GCN-NEXT: 
v_writelane_b32 v40, s49, 15 -; GCN-NEXT: v_writelane_b32 v40, s50, 16 -; GCN-NEXT: v_writelane_b32 v40, s51, 17 -; GCN-NEXT: v_writelane_b32 v40, s52, 18 -; GCN-NEXT: v_writelane_b32 v40, s53, 19 -; GCN-NEXT: v_writelane_b32 v40, s54, 20 -; GCN-NEXT: v_writelane_b32 v40, s55, 21 -; GCN-NEXT: v_writelane_b32 v40, s56, 22 -; GCN-NEXT: v_writelane_b32 v40, s57, 23 -; GCN-NEXT: v_writelane_b32 v40, s58, 24 -; GCN-NEXT: v_writelane_b32 v40, s59, 25 -; GCN-NEXT: v_writelane_b32 v40, s60, 26 -; GCN-NEXT: v_writelane_b32 v40, s61, 27 -; GCN-NEXT: v_writelane_b32 v40, s62, 28 -; GCN-NEXT: v_writelane_b32 v40, s63, 29 -; GCN-NEXT: v_writelane_b32 v40, s30, 30 -; GCN-NEXT: v_writelane_b32 v40, s31, 31 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s30, 14 +; GCN-NEXT: v_writelane_b32 v40, s31, 15 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s6, v0 @@ -1490,32 +1282,16 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: s_cbranch_execnz .LBB9_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_readlane_b32 s30, v40, 30 -; GCN-NEXT: v_readlane_b32 s31, v40, 31 -; GCN-NEXT: v_readlane_b32 s63, v40, 29 -; GCN-NEXT: v_readlane_b32 s62, v40, 28 -; GCN-NEXT: v_readlane_b32 s61, v40, 27 -; GCN-NEXT: v_readlane_b32 s60, v40, 26 -; GCN-NEXT: v_readlane_b32 s59, v40, 25 -; GCN-NEXT: v_readlane_b32 s58, v40, 24 -; GCN-NEXT: v_readlane_b32 s57, v40, 23 -; GCN-NEXT: v_readlane_b32 s56, v40, 22 -; GCN-NEXT: v_readlane_b32 s55, v40, 21 -; GCN-NEXT: v_readlane_b32 s54, v40, 20 -; GCN-NEXT: v_readlane_b32 s53, v40, 19 -; GCN-NEXT: v_readlane_b32 s52, v40, 18 
-; GCN-NEXT: v_readlane_b32 s51, v40, 17 -; GCN-NEXT: v_readlane_b32 s50, v40, 16 -; GCN-NEXT: v_readlane_b32 s49, v40, 15 -; GCN-NEXT: v_readlane_b32 s48, v40, 14 -; GCN-NEXT: v_readlane_b32 s47, v40, 13 -; GCN-NEXT: v_readlane_b32 s46, v40, 12 -; GCN-NEXT: v_readlane_b32 s45, v40, 11 -; GCN-NEXT: v_readlane_b32 s44, v40, 10 -; GCN-NEXT: v_readlane_b32 s43, v40, 9 -; GCN-NEXT: v_readlane_b32 s42, v40, 8 -; GCN-NEXT: v_readlane_b32 s41, v40, 7 -; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s30, v40, 14 +; GCN-NEXT: v_readlane_b32 s31, v40, 15 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 ; GCN-NEXT: v_readlane_b32 s39, v40, 5 ; GCN-NEXT: v_readlane_b32 s38, v40, 4 ; GCN-NEXT: v_readlane_b32 s37, v40, 3 @@ -1545,32 +1321,16 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s37, 3 ; GISEL-NEXT: v_writelane_b32 v40, s38, 4 ; GISEL-NEXT: v_writelane_b32 v40, s39, 5 -; GISEL-NEXT: v_writelane_b32 v40, s40, 6 -; GISEL-NEXT: v_writelane_b32 v40, s41, 7 -; GISEL-NEXT: v_writelane_b32 v40, s42, 8 -; GISEL-NEXT: v_writelane_b32 v40, s43, 9 -; GISEL-NEXT: v_writelane_b32 v40, s44, 10 -; GISEL-NEXT: v_writelane_b32 v40, s45, 11 -; GISEL-NEXT: v_writelane_b32 v40, s46, 12 -; GISEL-NEXT: v_writelane_b32 v40, s47, 13 -; GISEL-NEXT: v_writelane_b32 v40, s48, 14 -; GISEL-NEXT: v_writelane_b32 v40, s49, 15 -; GISEL-NEXT: v_writelane_b32 v40, s50, 16 -; GISEL-NEXT: v_writelane_b32 v40, s51, 17 -; GISEL-NEXT: v_writelane_b32 v40, s52, 18 -; GISEL-NEXT: v_writelane_b32 v40, s53, 19 -; GISEL-NEXT: v_writelane_b32 v40, s54, 20 -; GISEL-NEXT: v_writelane_b32 v40, s55, 21 -; GISEL-NEXT: v_writelane_b32 v40, s56, 22 -; GISEL-NEXT: v_writelane_b32 v40, 
s57, 23 -; GISEL-NEXT: v_writelane_b32 v40, s58, 24 -; GISEL-NEXT: v_writelane_b32 v40, s59, 25 -; GISEL-NEXT: v_writelane_b32 v40, s60, 26 -; GISEL-NEXT: v_writelane_b32 v40, s61, 27 -; GISEL-NEXT: v_writelane_b32 v40, s62, 28 -; GISEL-NEXT: v_writelane_b32 v40, s63, 29 -; GISEL-NEXT: v_writelane_b32 v40, s30, 30 -; GISEL-NEXT: v_writelane_b32 v40, s31, 31 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s30, 14 +; GISEL-NEXT: v_writelane_b32 v40, s31, 15 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s6, v0 @@ -1583,32 +1343,16 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: s_cbranch_execnz .LBB9_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_readlane_b32 s30, v40, 30 -; GISEL-NEXT: v_readlane_b32 s31, v40, 31 -; GISEL-NEXT: v_readlane_b32 s63, v40, 29 -; GISEL-NEXT: v_readlane_b32 s62, v40, 28 -; GISEL-NEXT: v_readlane_b32 s61, v40, 27 -; GISEL-NEXT: v_readlane_b32 s60, v40, 26 -; GISEL-NEXT: v_readlane_b32 s59, v40, 25 -; GISEL-NEXT: v_readlane_b32 s58, v40, 24 -; GISEL-NEXT: v_readlane_b32 s57, v40, 23 -; GISEL-NEXT: v_readlane_b32 s56, v40, 22 -; GISEL-NEXT: v_readlane_b32 s55, v40, 21 -; GISEL-NEXT: v_readlane_b32 s54, v40, 20 -; GISEL-NEXT: v_readlane_b32 s53, v40, 19 -; GISEL-NEXT: v_readlane_b32 s52, v40, 18 -; GISEL-NEXT: v_readlane_b32 s51, v40, 17 -; GISEL-NEXT: v_readlane_b32 s50, v40, 16 -; GISEL-NEXT: v_readlane_b32 s49, v40, 15 -; GISEL-NEXT: v_readlane_b32 s48, v40, 14 -; GISEL-NEXT: v_readlane_b32 s47, v40, 13 -; GISEL-NEXT: v_readlane_b32 s46, v40, 12 -; GISEL-NEXT: 
v_readlane_b32 s45, v40, 11 -; GISEL-NEXT: v_readlane_b32 s44, v40, 10 -; GISEL-NEXT: v_readlane_b32 s43, v40, 9 -; GISEL-NEXT: v_readlane_b32 s42, v40, 8 -; GISEL-NEXT: v_readlane_b32 s41, v40, 7 -; GISEL-NEXT: v_readlane_b32 s40, v40, 6 +; GISEL-NEXT: v_readlane_b32 s30, v40, 14 +; GISEL-NEXT: v_readlane_b32 s31, v40, 15 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 ; GISEL-NEXT: v_readlane_b32 s39, v40, 5 ; GISEL-NEXT: v_readlane_b32 s38, v40, 4 ; GISEL-NEXT: v_readlane_b32 s37, v40, 3 diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll index 22e3cc4b047b1..6370484327ed7 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll @@ -1,23 +1,18 @@ -; RUN: opt -mtriple=amdgcn--amdhsa -S -O3 -enable-unsafe-fp-math %s | FileCheck -check-prefix=GCN -check-prefix=UNSAFE %s -; RUN: opt -mtriple=amdgcn--amdhsa -S -O3 -enable-no-nans-fp-math %s | FileCheck -check-prefix=GCN -check-prefix=NONANS %s -; RUN: opt -mtriple=amdgcn--amdhsa -S -O3 -enable-no-infs-fp-math %s | FileCheck -check-prefix=GCN -check-prefix=NOINFS %s - -; GCN: define float @foo(float %x) local_unnamed_addr #0 { -; GCN: define amdgpu_kernel void @caller(ptr addrspace(1) nocapture %p) local_unnamed_addr #1 { -; GCN: %mul.i = fmul float %load, 1.500000e+01 - -; UNSAFE: attributes #0 = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; UNSAFE: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } - -; NOINFS: attributes #0 = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" 
"unsafe-fp-math"="true" } -; NOINFS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } - -; NONANS: attributes #0 = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; NONANS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=GCN,UNSAFE %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-no-nans-fp-math %s | FileCheck --check-prefixes=GCN,NONANS %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-no-infs-fp-math %s | FileCheck --check-prefixes=GCN,NOINFS %s declare void @extern() #0 define float @foo(float %x) #0 { +; GCN-LABEL: define float @foo( +; GCN-SAME: float [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; GCN-NEXT: [[ENTRY:.*:]] +; GCN-NEXT: tail call void @extern() +; GCN-NEXT: [[MUL:%.*]] = fmul float [[X]], 1.500000e+01 +; GCN-NEXT: ret float [[MUL]] +; entry: call void @extern() %mul = fmul float %x, 1.500000e+01 @@ -25,6 +20,15 @@ entry: } define amdgpu_kernel void @caller(ptr addrspace(1) %p) #1 { +; GCN-LABEL: define amdgpu_kernel void @caller( +; GCN-SAME: ptr addrspace(1) nocapture [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; GCN-NEXT: [[ENTRY:.*:]] +; GCN-NEXT: [[LOAD:%.*]] = load float, ptr addrspace(1) [[P]], align 4, !amdgpu.noclobber [[META0:![0-9]+]] +; GCN-NEXT: tail call void @extern() +; GCN-NEXT: [[MUL_I:%.*]] = fmul float [[LOAD]], 1.500000e+01 +; GCN-NEXT: store float [[MUL_I]], ptr addrspace(1) [[P]], align 4 +; GCN-NEXT: ret void +; entry: %load = load float, ptr addrspace(1) %p, align 4 %call = call fast float @foo(float 
%load) @@ -34,3 +38,24 @@ entry: attributes #0 = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true"} attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } + +;. +; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +;. +; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +;. +; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +;. +; UNSAFE: [[META0]] = !{} +;. +; NONANS: [[META0]] = !{} +;. +; NOINFS: [[META0]] = !{} +;. +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; NOINFS: {{.*}} +; NONANS: {{.*}} +; UNSAFE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll index b6c2a19470a7f..12f7fe317bb30 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll @@ -185,6 +185,38 @@ define fastcc i32 @foo() { ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr78 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; CHECK-NEXT: S_WAITCNT 0 ; CHECK-NEXT: $sgpr16 = S_MOV_B32 $sgpr33 ; CHECK-NEXT: $sgpr33 = S_MOV_B32 $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index add8c0f75bf33..0c9d45078ff27 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -654,7 +654,7 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 m0, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 @@ -671,7 +671,6 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v ; GCN-NEXT: v_mov_b32_e32 v13, s21 ; GCN-NEXT: v_mov_b32_e32 v14, s22 ; GCN-NEXT: v_mov_b32_e32 v15, s23 -; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 48 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 @@ -720,15 +719,14 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NEXT: v_mov_b32_e32 v6, s14 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v7, s15 ; GCN-NEXT: 
v_mov_b32_e32 v9, s1 ; GCN-NEXT: v_mov_b32_e32 v10, s2 ; GCN-NEXT: v_mov_b32_e32 v11, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s16 ; GCN-NEXT: v_mov_b32_e32 v13, s17 -; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 m0, s0, 1 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_add_u32 s0, s6, 16 ; GCN-NEXT: v_movreld_b32_e32 v1, v16 @@ -765,8 +763,7 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: s_lshl_b32 s0, s0, 1 -; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_lshl_b32 m0, s0, 1 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 @@ -872,8 +869,7 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s0, 1 -; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_lshl_b32 m0, s0, 1 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NEXT: v_mov_b32_e32 v6, s14 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 72cda5c718f5b..9827dba9f5f69 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -2264,13 +2264,12 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 { ; SI-LABEL: dynamic_insertelement_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[8:9], 0x20 ; SI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x10 +; SI-NEXT: s_load_dword s4, s[8:9], 0x20 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 1 ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_mov_b32_e32 v2, s14 @@ -2287,7 +2286,7 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 ; SI-NEXT: v_mov_b32_e32 v13, s25 ; SI-NEXT: v_mov_b32_e32 v14, s26 ; SI-NEXT: v_mov_b32_e32 v15, s27 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_lshl_b32 m0, s4, 1 ; SI-NEXT: v_movreld_b32_e32 v0, 0 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_movreld_b32_e32 v1, v16 @@ -2299,13 +2298,12 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[8:9], 0x80 ; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40 +; VI-NEXT: s_load_dword s4, s[8:9], 0x80 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 @@ -2322,7 +2320,7 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 ; VI-NEXT: v_mov_b32_e32 v13, s25 ; VI-NEXT: v_mov_b32_e32 v14, s26 ; VI-NEXT: v_mov_b32_e32 v15, s27 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_lshl_b32 m0, s4, 1 ; VI-NEXT: v_movreld_b32_e32 v0, 0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v1, v16 diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index fbeda72725b2a..5256cbcef123a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -289,7 +289,6 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst ret i32 %result @@ -591,7 +590,6 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) ; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe ; GFX12-NEXT: s_wait_alu 0xfffe @@ -802,7 +800,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s0, s0, 5 @@ -1050,7 +1047,6 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s1, s1, 5 @@ -1060,12 +1056,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB7_2: -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1236,7 +1232,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: ; %bb.1: ; 
GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s1, s1, 5 @@ -1245,7 +1240,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: .LBB8_2: -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index edf900a50cd4b..c0de009e935e6 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5123,6 +5123,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v7, v4, v3 @@ -5131,9 +5132,10 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 ; GFX1200-SDAG-NEXT: v_add3_u32 v1, v1, v7, v6 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v6, v4, v3 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v2, 0 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo ; 
GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2 @@ -5149,6 +5151,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v4, v1, v3 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v3, v[0:1] ; GFX1200-SDAG-NEXT: v_add3_u32 v1, v4, v1, v2 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-GISEL-LABEL: clpeak_imad_pat_i64: @@ -5159,6 +5162,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v4, v2 @@ -5170,6 +5174,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mov_b32_e32 v7, v0 ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v4, v2 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v3, v[0:1] @@ -5179,10 +5184,12 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v2, v[0:1] ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v3, v4 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v5, v3, v4 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mov_b32_e32 v6, v0 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, 
v7, vcc_lo ; GFX1200-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v3, 1 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v2, v[1:2] @@ -5193,6 +5200,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v6, v[2:3] ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v7, v[1:2] +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add i64 %x, 1 @@ -5969,8 +5977,10 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1200-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo ; GFX1200-SDAG-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo ; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v12, v9, v4 @@ -5983,11 +5993,12 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-SDAG-NEXT: v_add3_u32 v12, v1, v13, v12 ; GFX1200-SDAG-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 ; GFX1200-SDAG-NEXT: v_add3_u32 v13, v3, v15, v14 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v12, v9, vcc_lo ; GFX1200-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v2, v10 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: 
v_add_co_ci_u32_e32 v9, vcc_lo, v13, v11, vcc_lo -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v10, v3, v4 ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v11, v1, v5 ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v7, v8, v7 @@ -6019,6 +6030,7 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_add3_u32 v1, v9, v1, v4 ; GFX1200-SDAG-NEXT: v_add3_u32 v3, v10, v3, v6 +; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i64: @@ -6029,10 +6041,12 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo ; GFX1200-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v8, v4 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v10, v6 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 @@ -6043,9 +6057,10 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v9, v4, v[2:3] ; GFX1200-GISEL-NEXT: v_mov_b32_e32 v14, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v11, v6, v[0:1] ; GFX1200-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v12, v8 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v14, v9, vcc_lo ; GFX1200-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v13, v10 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) @@ -6054,12 +6069,13 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v15, v3, v4 ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v10, v6 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v8, v11, vcc_lo ; GFX1200-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v12, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v3, v5, v[0:1] +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v14, vcc_lo -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v10, v7, v[1:2] ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v15, v12 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -6072,16 +6088,18 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v16, v9 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v14, v16, v9 ; GFX1200-GISEL-NEXT: v_mov_b32_e32 v11, v3 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v8, v15, v12 ; GFX1200-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v15, 1 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: 
v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo ; GFX1200-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v16, 1 +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v11, vcc_lo -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v8, v10 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[5:6], null, v16, v7, v[0:1] -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v14, v15 ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[6:7], null, v2, v12, v[4:5] ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v8, v13, v[1:2] @@ -6093,6 +6111,7 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v8, v10 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v2, v14, v15 ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v15, v[4:5] +; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i64> %x, @@ -9354,7 +9373,9 @@ define i64 @mul_u24_zext_add64(i32 %x, i32 %y, i64 %z) { ; GFX1200-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX1200-NEXT: s_wait_alu 0xfffd ; GFX1200-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX1200-NEXT: s_wait_alu 0xfffd ; GFX1200-NEXT: s_setpc_b64 s[30:31] %mul = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y) %mul.zext = zext i32 %mul to i64 diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll index ae309f3a614d5..a9fc6a8c4d210 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll @@ -20,13 +20,13 @@ define 
amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] @@ -50,13 +50,13 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] @@ -85,11 +85,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: 
[[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] @@ -115,11 +115,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] @@ -148,13 +148,13 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] @@ -178,13 +178,13 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; 
GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] @@ -213,11 +213,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] @@ -243,11 +243,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 
target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] @@ -276,13 +276,13 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] @@ -306,13 +306,13 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: 
$vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] @@ -341,11 +341,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] @@ -371,11 +371,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] @@ -404,13 +404,13 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY 
[[COPY2]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] @@ -434,13 +434,13 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] @@ -469,11 +469,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; DAGISEL-GFX11-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] @@ -499,11 +499,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] @@ -535,13 +535,13 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = 
COPY [[COPY5]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY6]] @@ -564,13 +564,13 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY6]] @@ -596,17 +596,17 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] @@ -633,17 +633,17 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY 
[[REG_SEQUENCE]].sub1 ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; DAGISEL-GFX10-NEXT: 
$sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY16]] @@ -672,7 +672,7 @@ define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32 ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY4]] ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY2]], 0, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 @@ -688,7 +688,7 @@ define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32 ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY4]] ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 @@ -707,12 +707,12 @@ define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32 ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec + ; DAGISEL-GFX11-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY]] @@ -730,12 +730,12 @@ define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32 ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec ; 
DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] @@ -760,13 +760,13 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY 
[[COPY4]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY5]] @@ -791,13 +791,13 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY4]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY5]] @@ -827,11 +827,11 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] @@ -857,11 +857,11 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit 
$exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY11]] ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -893,13 +893,13 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 i ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY6]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY7]] @@ -923,13 +923,13 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 i ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY 
[[COPY3]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY6]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY7]] @@ -956,17 +956,17 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 i ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec + ; DAGISEL-GFX11-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_4]] @@ -993,17 +993,17 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 i ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec ; DAGISEL-GFX10-NEXT: 
[[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY17]] ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll index 90707e823c147..dfd1f0685a931 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll @@ -20,13 +20,13 @@ 
define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] @@ -50,13 +50,13 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] @@ -85,11 +85,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 
= S_MOV_B64 -1 ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] @@ -115,11 +115,11 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] @@ -148,13 +148,13 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit 
$exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] @@ -178,13 +178,13 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] @@ -213,11 +213,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] @@ -243,11 +243,11 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; 
DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] @@ -276,13 +276,13 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], 
implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] @@ -306,13 +306,13 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] @@ 
-341,11 +341,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] @@ -371,11 +371,11 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; 
DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] @@ -404,13 +404,13 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] @@ -434,13 +434,13 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] @@ -469,11 +469,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed 
[[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] @@ -499,11 +499,11 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] @@ -535,13 +535,13 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY5]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY6]] @@ -564,13 
+564,13 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY6]] @@ -596,17 +596,17 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec ; DAGISEL-GFX11-NEXT: 
[[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] @@ -633,17 +633,17 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; 
DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY16]] @@ -672,7 +672,7 @@ define amdgpu_cs_chain void @nonuniform_callee(ptr 
%callee, i32 inreg %sgpr, i32 ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY4]] ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 [[COPY2]], 0, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 @@ -688,7 +688,7 @@ define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32 ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY4]] ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 @@ -707,12 +707,12 @@ define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32 ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec ; DAGISEL-GFX11-NEXT: 
[[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY]] @@ -730,12 +730,12 @@ define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32 ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed 
[[V_READFIRSTLANE_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] @@ -761,13 +761,13 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY5]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY6]] @@ -794,13 +794,13 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg 
%exec, <3 x i32> inreg %sgpr ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY6]] @@ -832,11 +832,11 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = 
COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] @@ -864,11 +864,11 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; 
DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY12]] ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] @@ -902,13 +902,13 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 i ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY7]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY8]] @@ -934,13 +934,13 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 i ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr11 ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; 
GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY7]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY8]] @@ -969,17 +969,17 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 i ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec ; DAGISEL-GFX11-NEXT: 
[[REG_SEQUENCE2:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec ; DAGISEL-GFX11-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_4]] @@ -1008,17 +1008,17 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 i ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] - 
; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec ; DAGISEL-GFX10-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY18]] ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] diff --git a/llvm/test/CodeGen/AMDGPU/issue48473.mir b/llvm/test/CodeGen/AMDGPU/issue48473.mir index e272bd3480383..ec6f73080c988 100644 --- a/llvm/test/CodeGen/AMDGPU/issue48473.mir +++ b/llvm/test/CodeGen/AMDGPU/issue48473.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -start-before=greedy,0 -stop-after=virtregrewriter,1 
-verify-machineinstrs -o - 2> %t.err %s | FileCheck %s # RUN: FileCheck -check-prefix=ERR %s < %t.err @@ -43,7 +44,7 @@ # %25 to $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 # CHECK-LABEL: name: issue48473 -# CHECK: S_NOP 0, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15, implicit killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, implicit killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, implicit killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, implicit killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, implicit killed renamable $sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59, implicit killed renamable $sgpr92_sgpr93_sgpr94_sgpr95, implicit killed renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, implicit killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, implicit killed renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +# CHECK: S_NOP 0, implicit killed renamable $sgpr20_sgpr21_sgpr22_sgpr23, implicit killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit killed renamable $sgpr40_sgpr41_sgpr42_sgpr43, implicit killed renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit killed renamable $sgpr36_sgpr37_sgpr38_sgpr39, implicit killed renamable 
$sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, implicit killed renamable $sgpr80_sgpr81_sgpr82_sgpr83, implicit killed renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit killed renamable $sgpr16_sgpr17_sgpr18_sgpr19, implicit killed renamable $sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, implicit killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, implicit killed renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55, implicit renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr44_sgpr45_sgpr46_sgpr47, implicit killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 --- name: issue48473 diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir b/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir index b1b523c41c40c..55f21d95bcac4 100644 --- a/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir +++ b/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir @@ -456,6 +456,38 @@ body: | ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr57 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr40, 0 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll index 3e6de32492457..44139fafbfe20 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll @@ -38,30 +38,30 @@ define bfloat @sitofp_i128_to_bf16(i128 
%x) { ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN-NEXT: v_add_u32_e32 v6, 64, v6 ; GCN-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc -; GCN-NEXT: v_sub_u32_e32 v6, 0x80, v7 -; GCN-NEXT: v_sub_u32_e32 v2, 0x7f, v7 -; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 +; GCN-NEXT: v_sub_u32_e32 v2, 0x80, v7 +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: ; %bb.2: ; %itofp-if-else -; GCN-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 -; GCN-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GCN-NEXT: v_add_u32_e32 v2, 0xffffff98, v7 +; GCN-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GCN-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc -; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr2 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr7 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-NEXT: ; %bb.3: ; %Flow3 -; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; GCN-NEXT: v_sub_u32_e32 v6, 0x7f, v7 +; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] ; GCN-NEXT: s_cbranch_execz .LBB0_13 ; GCN-NEXT: ; %bb.4: ; %NodeBlock -; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 25, v2 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB0_8 ; GCN-NEXT: ; %bb.5: ; %LeafBlock -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 26, v2 ; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_7 ; GCN-NEXT: ; %bb.6: ; %itofp-sw-default @@ -120,13 +120,13 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: ; %bb.11: ; %itofp-if-then20 ; GCN-NEXT: v_alignbit_b32 v8, v1, v0, 3 -; GCN-NEXT: v_mov_b32_e32 v2, v6 +; GCN-NEXT: v_mov_b32_e32 v6, v2 ; GCN-NEXT: ; %bb.12: ; %Flow ; GCN-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB0_13: ; %Flow4 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v3 -; GCN-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 +; GCN-NEXT: v_lshl_add_u32 v1, v6, 23, 1.0 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 ; GCN-NEXT: v_or3_b32 v0, v2, v0, v1 ; GCN-NEXT: v_bfe_u32 v1, v8, 16, 1 @@ -166,9 +166,8 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GCN-NEXT: v_add_u32_e32 v5, 64, v5 ; GCN-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GCN-NEXT: v_sub_u32_e32 v5, 0x80, v6 -; GCN-NEXT: v_sub_u32_e32 v4, 0x7f, v6 -; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; GCN-NEXT: v_sub_u32_e32 v4, 0x80, v6 +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 25, v4 ; GCN-NEXT: ; implicit-def: $vgpr7 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -177,20 +176,21 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc -; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr6 ; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN-NEXT: ; %bb.3: ; %Flow3 -; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; GCN-NEXT: v_sub_u32_e32 v5, 0x7f, v6 +; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] ; GCN-NEXT: s_cbranch_execz .LBB1_13 ; GCN-NEXT: ; %bb.4: ; %NodeBlock -; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 25, v4 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB1_8 ; GCN-NEXT: ; %bb.5: ; %LeafBlock -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 26, v4 ; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_7 ; GCN-NEXT: ; %bb.6: ; %itofp-sw-default @@ -249,13 +249,13 
@@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: ; %bb.11: ; %itofp-if-then20 ; GCN-NEXT: v_alignbit_b32 v7, v1, v0, 3 -; GCN-NEXT: v_mov_b32_e32 v4, v5 +; GCN-NEXT: v_mov_b32_e32 v5, v4 ; GCN-NEXT: ; %bb.12: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB1_13: ; %Flow4 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 -; GCN-NEXT: v_lshl_or_b32 v0, v4, 23, v0 +; GCN-NEXT: v_lshl_or_b32 v0, v5, 23, v0 ; GCN-NEXT: v_add_u32_e32 v0, 1.0, v0 ; GCN-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GCN-NEXT: s_movk_i32 s4, 0x7fff diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index f372a54894604..c316ec71863d0 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -34,30 +34,30 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; SDAG-NEXT: v_add_u32_e32 v6, 64, v6 ; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc -; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 -; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x80, v7 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else -; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v7 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc -; SDAG-NEXT: ; implicit-def: $vgpr6 +; SDAG-NEXT: ; implicit-def: $vgpr2 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: 
s_or_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v7 +; SDAG-NEXT: s_xor_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB0_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v2 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v2 ; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc ; SDAG-NEXT: s_cbranch_execz .LBB0_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default @@ -116,13 +116,13 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 3 -; SDAG-NEXT: v_mov_b32_e32 v2, v6 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB0_13: ; %Flow4 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3 -; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 +; SDAG-NEXT: v_lshl_add_u32 v1, v6, 23, 1.0 ; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 ; SDAG-NEXT: v_or3_b32 v4, v2, v0, v1 ; SDAG-NEXT: .LBB0_14: ; %Flow5 @@ -161,9 +161,8 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 ; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v5 -; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 -; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v8 +; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7 ; GISEL-NEXT: ; implicit-def: $vgpr4 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -172,20 +171,21 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; 
GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 ; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: v_sub_u32_e32 v8, 0x7f, v5 +; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB0_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock -; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v8 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB0_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc ; GISEL-NEXT: s_cbranch_execz .LBB0_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default @@ -249,13 +249,13 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v7, v8 +; GISEL-NEXT: v_mov_b32_e32 v8, v7 ; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB0_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 -; GISEL-NEXT: v_lshl_add_u32 v1, v7, 23, 1.0 +; GISEL-NEXT: v_lshl_add_u32 v1, v8, 23, 1.0 ; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 ; GISEL-NEXT: v_or3_b32 v4, v2, v0, v1 ; GISEL-NEXT: .LBB0_14: ; %Flow5 @@ -288,9 +288,8 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 ; SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; SDAG-NEXT: v_sub_u32_e32 v5, 0x80, v6 -; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 -; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; 
SDAG-NEXT: v_sub_u32_e32 v4, 0x80, v6 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v4 ; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -299,20 +298,21 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc -; SDAG-NEXT: ; implicit-def: $vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr4 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: ; implicit-def: $vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: v_sub_u32_e32 v5, 0x7f, v6 +; SDAG-NEXT: s_xor_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB1_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v4 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v4 ; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc ; SDAG-NEXT: s_cbranch_execz .LBB1_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default @@ -371,13 +371,13 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 3 -; SDAG-NEXT: v_mov_b32_e32 v4, v5 +; SDAG-NEXT: v_mov_b32_e32 v5, v4 ; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB1_13: ; %Flow4 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 -; SDAG-NEXT: v_lshl_or_b32 v0, v4, 23, v0 +; SDAG-NEXT: v_lshl_or_b32 v0, v5, 23, v0 ; SDAG-NEXT: v_add_u32_e32 v4, 1.0, v0 ; SDAG-NEXT: .LBB1_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ 
-406,9 +406,8 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 ; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5 -; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 -; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7 +; GISEL-NEXT: v_sub_u32_e32 v6, 0x80, v5 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v6 ; GISEL-NEXT: ; implicit-def: $vgpr4 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -417,20 +416,21 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GISEL-NEXT: ; implicit-def: $vgpr7 +; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 ; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 +; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB1_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock -; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v6 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB1_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc ; GISEL-NEXT: s_cbranch_execz .LBB1_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default @@ -494,12 +494,12 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v6, v7 +; GISEL-NEXT: v_mov_b32_e32 v7, v6 ; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB1_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_lshl_add_u32 v0, v6, 23, 1.0 +; GISEL-NEXT: v_lshl_add_u32 v0, v7, 23, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GISEL-NEXT: v_and_or_b32 v4, v4, v1, v0 ; GISEL-NEXT: .LBB1_14: ; %Flow5 @@ -545,32 +545,32 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; SDAG-NEXT: v_add_u32_e32 v1, 64, v1 ; SDAG-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc -; SDAG-NEXT: v_sub_u32_e32 v8, 0x80, v9 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9 -; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v8 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x80, v9 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v2 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else -; SDAG-NEXT: v_add_u32_e32 v6, 0xffffffb5, v9 -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffffb5, v9 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; implicit-def: $vgpr2 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: v_sub_u32_e32 v8, 0x7f, v9 +; SDAG-NEXT: s_xor_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB2_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v8 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v2 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock -; SDAG-NEXT: 
v_cmp_ne_u32_e32 vcc, 55, v8 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v2 ; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc ; SDAG-NEXT: s_cbranch_execz .LBB2_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default @@ -640,16 +640,16 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], 3, v[4:5] -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 29, v6 -; SDAG-NEXT: v_or_b32_e32 v10, v1, v2 -; SDAG-NEXT: v_mov_b32_e32 v2, v8 +; SDAG-NEXT: v_lshlrev_b32_e32 v4, 29, v6 +; SDAG-NEXT: v_or_b32_e32 v10, v1, v4 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 ; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB2_13: ; %Flow4 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_mov_b32_e32 v2, 0x3ff00000 ; SDAG-NEXT: v_and_b32_e32 v1, 0x80000000, v3 -; SDAG-NEXT: v_mov_b32_e32 v3, 0x3ff00000 -; SDAG-NEXT: v_lshl_add_u32 v2, v2, 20, v3 +; SDAG-NEXT: v_lshl_add_u32 v2, v8, 20, v2 ; SDAG-NEXT: v_and_b32_e32 v3, 0xfffff, v10 ; SDAG-NEXT: v_or3_b32 v1, v3, v1, v2 ; SDAG-NEXT: .LBB2_14: ; %Flow5 @@ -690,9 +690,8 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_add_u32_e32 v0, 64, v0 ; GISEL-NEXT: v_min_u32_e32 v1, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc -; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v9 -; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v9 -; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v8 +; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v9 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v7 ; GISEL-NEXT: ; implicit-def: $vgpr10 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -703,19 +702,20 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc -; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr2 -; GISEL-NEXT: ; implicit-def: $vgpr9 ; 
GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: v_sub_u32_e32 v8, 0x7f, v9 +; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB2_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock -; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v8 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v7 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB2_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc ; GISEL-NEXT: s_cbranch_execz .LBB2_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default @@ -791,7 +791,7 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 3, v[2:3] -; GISEL-NEXT: v_mov_b32_e32 v7, v8 +; GISEL-NEXT: v_mov_b32_e32 v8, v7 ; GISEL-NEXT: v_lshl_or_b32 v10, v4, 29, v1 ; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] @@ -800,7 +800,7 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_and_b32_e32 v1, 0x80000000, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0x3ff00000 ; GISEL-NEXT: v_mov_b32_e32 v3, 0xfffff -; GISEL-NEXT: v_lshl_add_u32 v2, v7, 20, v2 +; GISEL-NEXT: v_lshl_add_u32 v2, v8, 20, v2 ; GISEL-NEXT: v_and_or_b32 v1, v10, v3, v1 ; GISEL-NEXT: v_or3_b32 v1, v1, v2, 0 ; GISEL-NEXT: .LBB2_14: ; %Flow5 @@ -833,9 +833,8 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 ; SDAG-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc -; SDAG-NEXT: v_sub_u32_e32 v7, 0x80, v8 -; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v8 -; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v7 +; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v8 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v6 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; 
SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -846,20 +845,21 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; SDAG-NEXT: ; implicit-def: $vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: v_sub_u32_e32 v7, 0x7f, v8 +; SDAG-NEXT: s_xor_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB3_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v7 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v6 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v6 ; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc ; SDAG-NEXT: s_cbranch_execz .LBB3_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default @@ -929,13 +929,13 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; SDAG-NEXT: v_alignbit_b32 v9, v2, v1, 3 -; SDAG-NEXT: v_mov_b32_e32 v6, v7 +; SDAG-NEXT: v_mov_b32_e32 v7, v6 ; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB3_13: ; %Flow4 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v9 -; SDAG-NEXT: v_lshl_or_b32 v0, v6, 20, v0 +; SDAG-NEXT: v_lshl_or_b32 v0, v7, 20, v0 ; SDAG-NEXT: v_add_u32_e32 v5, 0x3ff00000, v0 ; SDAG-NEXT: .LBB3_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -966,9 +966,8 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 ; 
GISEL-NEXT: v_min_u32_e32 v5, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc -; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v8 -; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v8 -; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v7 +; GISEL-NEXT: v_sub_u32_e32 v6, 0x80, v8 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v6 ; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -979,19 +978,20 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc -; GISEL-NEXT: ; implicit-def: $vgpr7 +; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v8 +; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB3_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock -; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v7 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v6 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB3_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v6 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc ; GISEL-NEXT: s_cbranch_execz .LBB3_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default @@ -1074,13 +1074,13 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 3, v1 ; GISEL-NEXT: v_or_b32_e32 v9, v0, v2 -; GISEL-NEXT: v_mov_b32_e32 v6, v7 +; GISEL-NEXT: v_mov_b32_e32 v7, v6 ; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB3_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff00000 
-; GISEL-NEXT: v_lshl_add_u32 v0, v6, 20, v0 +; GISEL-NEXT: v_lshl_add_u32 v0, v7, 20, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v9 ; GISEL-NEXT: v_or3_b32 v5, v1, v0, 0 ; GISEL-NEXT: .LBB3_14: ; %Flow5 @@ -1124,30 +1124,30 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; SDAG-NEXT: v_add_u32_e32 v6, 64, v6 ; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc -; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 -; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x80, v7 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else -; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 -; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v7 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc -; SDAG-NEXT: ; implicit-def: $vgpr6 +; SDAG-NEXT: ; implicit-def: $vgpr2 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v7 +; SDAG-NEXT: s_xor_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB4_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v2 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB4_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v2 ; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc ; SDAG-NEXT: s_cbranch_execz .LBB4_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default @@ 
-1206,13 +1206,13 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 3 -; SDAG-NEXT: v_mov_b32_e32 v2, v6 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB4_13: ; %Flow4 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3 -; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 +; SDAG-NEXT: v_lshl_add_u32 v1, v6, 23, 1.0 ; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 ; SDAG-NEXT: v_or3_b32 v0, v2, v0, v1 ; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0 @@ -1252,9 +1252,8 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 ; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v5 -; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 -; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v8 +; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7 ; GISEL-NEXT: ; implicit-def: $vgpr4 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1263,20 +1262,21 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 ; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: v_sub_u32_e32 v8, 0x7f, v5 +; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB4_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock -; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v8 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7 ; GISEL-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB4_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc ; GISEL-NEXT: s_cbranch_execz .LBB4_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default @@ -1340,13 +1340,13 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v7, v8 +; GISEL-NEXT: v_mov_b32_e32 v8, v7 ; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB4_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 -; GISEL-NEXT: v_lshl_add_u32 v1, v7, 23, 1.0 +; GISEL-NEXT: v_lshl_add_u32 v1, v8, 23, 1.0 ; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 ; GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0 @@ -1380,9 +1380,8 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 ; SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; SDAG-NEXT: v_sub_u32_e32 v5, 0x80, v6 -; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 -; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: v_sub_u32_e32 v4, 0x80, v6 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v4 ; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1391,20 +1390,21 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc -; SDAG-NEXT: ; implicit-def: $vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr4 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: ; implicit-def: $vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: ; %bb.3: ; %Flow3 -; 
SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: v_sub_u32_e32 v5, 0x7f, v6 +; SDAG-NEXT: s_xor_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB5_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v4 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB5_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v4 ; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc ; SDAG-NEXT: s_cbranch_execz .LBB5_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default @@ -1463,13 +1463,13 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 3 -; SDAG-NEXT: v_mov_b32_e32 v4, v5 +; SDAG-NEXT: v_mov_b32_e32 v5, v4 ; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB5_13: ; %Flow4 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 -; SDAG-NEXT: v_lshl_or_b32 v0, v4, 23, v0 +; SDAG-NEXT: v_lshl_or_b32 v0, v5, 23, v0 ; SDAG-NEXT: v_add_u32_e32 v0, 1.0, v0 ; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0 ; SDAG-NEXT: .LBB5_14: ; %Flow5 @@ -1499,9 +1499,8 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 ; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5 -; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 -; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7 +; GISEL-NEXT: v_sub_u32_e32 v6, 0x80, v5 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v6 ; GISEL-NEXT: ; implicit-def: $vgpr4 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1510,20 +1509,21 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, 
v[0:1] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GISEL-NEXT: ; implicit-def: $vgpr7 +; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 ; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 +; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB5_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock -; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7 +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v6 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB5_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc ; GISEL-NEXT: s_cbranch_execz .LBB5_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default @@ -1587,12 +1587,12 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v6, v7 +; GISEL-NEXT: v_mov_b32_e32 v7, v6 ; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB5_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_lshl_add_u32 v0, v6, 23, 1.0 +; GISEL-NEXT: v_lshl_add_u32 v0, v7, 23, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GISEL-NEXT: v_and_or_b32 v0, v4, v1, v0 ; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll index 5cb3ca0b80b66..315f95ef15d42 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll @@ -67,7 +67,6 @@ main_body: } ; There are 8 pseudo registers defined to track 
LDS DMA dependencies. -; When exhausted we default to vmcnt(0). ; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays: ; GCN-COUNT-10: buffer_load_dword @@ -86,7 +85,6 @@ main_body: ; GCN: s_waitcnt vmcnt(2) ; GCN-NOT: s_waitcnt vmcnt ; GCN: ds_read_b32 -; GCN: s_waitcnt vmcnt(0) ; GCN: ds_read_b32 define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) { main_body: @@ -151,4 +149,29 @@ main_body: ret void } +define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) { +; GFX9-LABEL: global_load_lds_no_alias_ds_read: +; GFX9: global_load_dword +; GFX9: global_load_dword +; GFX9: s_waitcnt vmcnt(1) +; GFX9-NOT: s_waitcnt vmcnt(0) +; GFX9: ds_read_b32 +; GFX9: s_waitcnt vmcnt(0) +; GFX9: ds_read_b32 +; GFX9: s_endpgm +body: + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0) + call void @llvm.amdgcn.s.waitcnt(i32 3953) + %gep.0 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i1 + %val.0 = load float, ptr addrspace(3) %gep.0, align 4 + call void @llvm.amdgcn.s.waitcnt(i32 3952) + %gep.1 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2 + %val.1 = load float, ptr addrspace(3) %gep.1, align 4 + %tmp = insertelement <2 x float> poison, float %val.0, i32 0 + %res = insertelement <2 x float> %tmp, float %val.1, i32 1 + store <2 x float> %res, ptr addrspace(1) %out + ret void +} + declare void @llvm.amdgcn.wave.barrier() diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll index d23dee1f02f09..d76b6b2123f56 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll @@ -25,17 +25,17 @@ define half @raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(< ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -78,17 +78,17 @@ define <2 x half> @raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_s ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -131,17 +131,17 @@ define <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_s ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = 
S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -188,17 +188,17 @@ define half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(< ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: 
[[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -241,17 +241,17 @@ define <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_s ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll index bdcb77201714a..ba322032c8f16 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll +++ 
b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll @@ -25,17 +25,17 @@ define float @raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset( ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -78,17 +78,17 @@ define <2 x float> @raw_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_ ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -134,17 +134,17 @@ define <3 x float> @raw_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_ ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = 
S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -192,17 +192,17 @@ define <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_ ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: 
[[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -253,17 +253,17 @@ define float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset( ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -306,17 +306,17 @@ define <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_ ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit 
$exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; 
GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll index 28059db0bede3..0d110dee2f240 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll @@ -25,17 +25,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i3 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -79,17 +79,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(<4 x i3 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -133,17 +133,17 @@ define float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i3 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def 
$scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -187,17 +187,17 @@ define float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<4 x i3 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -241,17 +241,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], 
implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -295,17 +295,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], 
[[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -349,17 +349,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -403,17 +403,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_dlc ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -457,17 +457,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_dlc ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = 
REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -511,17 +511,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_slc ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -565,17 +565,17 @@ define <2 x float> 
@raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], 
%subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -621,17 +621,17 @@ define <3 x float> @raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec 
+ ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -679,17 +679,17 @@ define <4 x float> @raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -739,17 +739,17 @@ define half @raw_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def 
$scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -793,17 +793,17 @@ define <2 x half> @raw_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset( ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -846,17 +846,17 @@ define <4 x half> @raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset( ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], 
implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -902,17 +902,17 @@ define float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], 
[[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -957,17 +957,17 @@ define float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1012,17 +1012,17 @@ define float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1067,17 +1067,17 @@ define float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = 
REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1123,17 +1123,17 @@ define half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1178,17 +1178,17 @@ define float 
@raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], 
%subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1232,17 +1232,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1284,17 +1284,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1337,17 +1337,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def 
$scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1390,17 +1390,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1444,17 +1444,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], 
implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1500,17 +1500,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], 
[[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1553,12 +1553,12 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -1602,12 +1602,12 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -1654,17 +1654,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1710,17 +1710,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], 
implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1766,17 +1766,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], 
[[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1823,17 +1823,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], 
[[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1880,17 +1880,17 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; 
GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 
[[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll index ed5fa05fa8ed3..c443e6a94e1bb 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll @@ -25,17 +25,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -77,17 +77,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -130,17 +130,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -188,17 +188,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], 
[[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -247,17 +247,17 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -299,12 +299,12 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: 
[[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -348,12 +348,12 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; 
GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -398,17 +398,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; 
GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -452,17 +452,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -508,17 +508,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit 
$exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; 
GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -571,17 +571,17 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; 
GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll index e38de72e1f0f1..8f7ada6d785b3 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll @@ -25,17 +25,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], 
[[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -77,17 +77,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 
[[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -135,17 +135,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -195,17 +195,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 
= V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -257,17 +257,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -319,17 +319,17 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], 
[[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -376,12 +376,12 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -430,12 +430,12 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -485,17 +485,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: 
[[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -544,17 +544,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; 
GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -605,17 +605,17 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -672,17 +672,17 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit 
$exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; 
GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll index 39c58f8f39d59..7707706fbcda7 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll @@ -26,17 +26,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(< ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -80,17 +80,17 @@ define void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset(< ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec 
+ ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def 
$scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -134,17 +134,17 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(< ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -188,17 +188,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(< ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -242,17 +242,17 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(< ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -295,17 +295,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -348,17 +348,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_s ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], 
[[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -401,17 +401,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -454,17 +454,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_d ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: 
[[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -507,17 +507,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_s ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -560,17 +560,17 @@ 
define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -613,17 +613,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -671,17 +671,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -731,17 +731,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def 
$scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -793,17 +793,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -846,17 +846,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], 
implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -900,17 +900,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], 
[[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -955,17 +955,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1008,17 +1008,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1066,17 +1066,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = 
REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1124,17 +1124,17 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1176,17 +1176,17 @@ define void 
@raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4095 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], 
%subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1229,17 +1229,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4096 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1282,17 +1282,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1336,17 +1336,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def 
$scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1392,17 +1392,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1445,12 +1445,12 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -1494,12 +1494,12 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -1544,17 +1544,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, 
implicit-def $scc, implicit $exec @@ -1598,17 +1598,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1654,17 +1654,17 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1711,17 +1711,17 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_o ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = 
S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1766,17 +1766,17 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_o ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: 
[[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll index 5b8bd1f60233b..31225a39e9abd 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll @@ -35,17 +35,17 @@ define half @raw_ptr_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, 
implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed 
[[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -98,17 +98,17 @@ define <2 x half> @raw_ptr_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sg ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -161,17 +161,17 @@ define <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sg ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -228,17 +228,17 @@ define half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; 
GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -291,17 +291,17 @@ define <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sg ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], 
[[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll index 7dabd9a395746..750284aef47b9 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll @@ -35,17 +35,17 @@ define float @raw_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -98,17 +98,17 @@ define <2 x float> @raw_ptr_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], 
implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -164,17 +164,17 @@ define <3 x float> @raw_ptr_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -232,17 +232,17 @@ define <4 x float> @raw_ptr_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -303,17 +303,17 @@ define float @raw_ptr_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -366,17 +366,17 @@ define <4 x float> @raw_ptr_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll index 
1a9f7b1619f4c..fedf7510d04f5 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll @@ -35,17 +35,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -99,17 +99,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -163,17 +163,17 @@ define float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -227,17 +227,17 @@ define float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(ptr ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -291,17 +291,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -355,17 +355,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: 
%bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -419,17 +419,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 
= V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -483,17 +483,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, 
implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed 
[[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -547,17 +547,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -611,17 +611,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -675,17 +675,17 @@ define <2 x float> @raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; 
GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -741,17 +741,17 @@ define <3 x float> @raw_ptr_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], 
[[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -809,17 +809,17 @@ define <4 x float> @raw_ptr_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -879,17 +879,17 @@ define half @raw_ptr_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -943,17 +943,17 @@ define <2 x half> @raw_ptr_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: 
[[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1006,17 +1006,17 @@ define <4 x half> @raw_ptr_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1072,17 
+1072,17 @@ define float @raw_ptr_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], 
%subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1137,17 +1137,17 @@ define float @raw_ptr_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1202,17 +1202,17 @@ define float @raw_ptr_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zex ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, 
implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], 
[[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1267,17 +1267,17 @@ define float @raw_ptr_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sex ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1333,17 +1333,17 @@ define half @raw_ptr_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, 
implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1398,17 +1398,17 @@ define float @raw_ptr_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1462,17 +1462,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; 
GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1524,17 +1524,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1587,17 +1587,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; 
GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1650,17 +1650,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_vof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1714,17 +1714,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1780,17 +1780,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], 
implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1843,12 +1843,12 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -1902,12 +1902,12 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -1964,17 +1964,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -2030,17 +2030,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], 
[[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -2096,17 +2096,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -2163,17 +2163,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -2230,17 +2230,17 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_vof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll index eada2004161d1..67a2d9789c473 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll @@ -35,17 +35,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -97,17 +97,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -160,17 +160,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -228,17 +228,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -297,17 +297,17 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: 
%bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -359,12 +359,12 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 
= V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -418,12 +418,12 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -478,17 +478,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -542,17 +542,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -608,17 +608,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: 
%bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -681,17 +681,17 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 
= V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll index 60db62dc43a61..d70a4b608d7f7 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll @@ -35,17 +35,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, 
implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], 
implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -97,17 +97,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -165,17 +165,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; 
GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -235,17 +235,17 @@ define void 
@raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -307,17 +307,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -379,17 +379,17 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def 
$scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -446,12 +446,12 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -510,12 +510,12 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -575,17 +575,17 @@ define void 
@raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -644,17 +644,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -715,17 +715,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def 
$scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -792,17 +792,17 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll index 78e29387b1d40..d53fd6180b696 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll @@ -36,17 +36,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], 
%subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -100,17 +100,17 @@ define void 
@raw_ptr_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -164,17 +164,17 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -228,17 +228,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def 
$scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -292,17 +292,17 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -355,17 +355,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -418,17 +418,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -481,17 +481,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -544,17 +544,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -607,17 +607,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: 
%bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -670,17 +670,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 
= V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -733,17 +733,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, 
implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed 
[[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -801,17 +801,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -871,17 +871,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -943,17 +943,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; 
GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1006,17 +1006,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], 
[[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1070,17 +1070,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1135,17 +1135,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1198,17 +1198,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: 
[[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1266,17 +1266,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1334,17 
+1334,17 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], 
%subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1396,17 +1396,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1459,17 +1459,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, 
implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], 
[[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1522,17 +1522,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1586,17 +1586,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, 
implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1652,17 +1652,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1715,12 +1715,12 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; 
GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -1774,12 +1774,12 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, 
implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -1834,17 +1834,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], 
implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1898,17 +1898,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1964,17 +1964,17 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -2031,17 +2031,17 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -2096,17 +2096,17 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll index 
24dc4f1b3c0aa..dd72f4e954bc3 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll @@ -34,17 +34,17 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr add ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 
[[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -97,17 +97,17 @@ define <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec 
- ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -160,17 +160,17 @@ define <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -226,17 +226,17 @@ define half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr add ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; 
GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -289,17 +289,17 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ptr ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -352,17 +352,17 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ptr ; 
GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -415,17 +415,17 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -478,17 +478,17 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ptr ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll index 01dc0328f2d2d..e215afac9b8f2 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll @@ -34,17 +34,17 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ad ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -97,17 +97,17 @@ define <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -163,17 +163,17 @@ define <3 x float> 
@raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], 
%subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -231,17 +231,17 @@ define <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, 
implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -301,17 +301,17 @@ define float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr ad ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; 
GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -364,17 +364,17 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(pt ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -427,17 +427,17 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(pt ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -490,17 +490,17 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -553,17 +553,17 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(pt ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll index cd60d5b21faba..14466b82db0ff 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll @@ -35,17 +35,17 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, 
implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], 
[[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -98,17 +98,17 @@ define void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 
[[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -166,17 +166,17 @@ define void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit 
$exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -230,17 +230,17 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -294,17 +294,17 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; 
GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -358,17 +358,17 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -421,17 +421,17 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ha ; 
GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -484,17 +484,17 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ha ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -547,17 +547,17 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -610,17 +610,17 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ha ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], 
implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll index 5c67d82c1e977..8ebd9194592f1 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll @@ -36,17 +36,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -105,17 +105,17 @@ define void 
@raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -176,17 +176,17 @@ define void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -249,17 +249,17 @@ define void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def 
$scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -313,17 +313,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], 
[[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -377,17 +377,17 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -441,17 +441,17 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -505,17 +505,17 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -569,17 +569,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(fl ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -633,17 +633,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(fl ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: 
%bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -697,17 +697,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 
= V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -761,17 +761,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(fl ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, 
implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed 
[[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -823,17 +823,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -885,17 +885,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -948,17 +948,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; 
GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1011,17 +1011,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], 
[[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1075,17 +1075,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1141,17 +1141,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1204,12 +1204,12 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: 
[[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -1263,12 +1263,12 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; 
GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -1325,17 +1325,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, 
implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1391,17 +1391,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1457,17 +1457,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1524,17 +1524,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec 
; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1591,17 +1591,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec ; GFX908-NEXT: 
[[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, 
implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll index bcffca8a3c4fc..4ea8685de15bd 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll @@ -24,17 +24,17 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i3 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -77,17 +77,17 @@ define <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -130,17 +130,17 @@ define <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: 
%bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -186,17 +186,17 @@ define half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i3 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, 
implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -239,17 +239,17 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, 
implicit-def $scc, implicit $exec @@ -292,17 +292,17 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -345,17 +345,17 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -398,17 +398,17 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = 
S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll index 51e56a47fc2f2..c7c60a1be3eb8 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll @@ -24,17 +24,17 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: 
[[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -77,17 +77,17 @@ define <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -133,17 +133,17 @@ 
define <3 x float> @raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -191,17 +191,17 @@ define <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -251,17 +251,17 @@ define float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -304,17 +304,17 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def 
$scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -357,17 +357,17 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -410,17 +410,17 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], 
implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -463,17 +463,17 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], 
[[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll index a1d8acdb4cc53..f2e0c4a3313a2 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll @@ -25,17 +25,17 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 
killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -78,17 +78,17 @@ define void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -136,17 +136,17 @@ define void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -190,17 +190,17 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: 
[[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -244,17 +244,17 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; 
GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -298,17 +298,17 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -351,17 +351,17 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ha ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit 
$exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; 
GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -404,17 +404,17 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ha ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; 
GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -457,17 +457,17 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec 
= V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -510,17 +510,17 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ha ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: 
%bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll index 7c0aa26a8a699..3e05d58ca4740 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll @@ -26,17 +26,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -85,17 +85,17 @@ define void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -146,17 +146,17 @@ define void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -209,17 +209,17 @@ define void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], 
[[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -263,17 +263,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -317,17 +317,17 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: 
[[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -371,17 +371,17 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -425,17 +425,17 @@ 
define void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -479,17 +479,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(fl ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -533,17 +533,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(fl ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -587,17 +587,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def 
$scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -641,17 +641,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(fl ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -693,17 +693,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit 
$exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], 
implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -745,17 +745,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], 
[[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -798,17 +798,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -851,17 +851,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 
= V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -905,17 +905,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = 
REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -961,17 +961,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1014,12 +1014,12 @@ define void 
@raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -1063,12 +1063,12 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; 
GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -1115,17 +1115,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit 
$exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1171,17 +1171,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1227,17 +1227,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1284,17 +1284,17 @@ define void 
@raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], 
%subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[V_ADD_U32_e64_]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -1341,17 +1341,17 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE]].sub2, implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll b/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll index bfd97c53522c9..b2f2c31782fe3 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll @@ -18,7 +18,7 @@ define float @llvm_amdgcn_raw_buffer_load_f32(i32 %voffset, i32 %soffset) { ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -52,7 +52,7 @@ define float @llvm_amdgcn_raw_tbuffer_load_f32(i32 %voffset, i32 %soffset) { ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -86,7 +86,7 @@ define <2 x float> @llvm_amdgcn_raw_buffer_load_v2f32(i32 %voffset, i32 %soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -123,7 +123,7 @@ define <2 x float> @llvm_amdgcn_raw_tbuffer_load_v2f32(i32 %voffset, i32 %soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: 
successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -160,7 +160,7 @@ define <3 x float> @llvm_amdgcn_raw_buffer_load_v3f32(i32 %voffset, i32 %soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -199,7 +199,7 @@ define <3 x float> @llvm_amdgcn_raw_tbuffer_load_v3f32(i32 %voffset, i32 %soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit 
$exec ; GFX908-NEXT: {{ $}} @@ -238,7 +238,7 @@ define <4 x float> @llvm_amdgcn_raw_buffer_load_v4f32(i32 %voffset, i32 %soffset ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -279,7 +279,7 @@ define <4 x float> @llvm_amdgcn_raw_tbuffer_load_v4f32(i32 %voffset, i32 %soffse ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -321,7 +321,7 @@ define void @llvm_amdgcn_raw_buffer_store_f32(float %val, i32 %voffset, i32 %sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec 
; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -355,7 +355,7 @@ define void @llvm_amdgcn_raw_tbuffer_store_f32(float %val, i32 %voffset, i32 %so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -394,7 +394,7 @@ define void @llvm_amdgcn_raw_buffer_store_v2f32(<2 x float> %val, i32 %voffset, ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -433,7 +433,7 @@ define void @llvm_amdgcn_raw_tbuffer_store_v2f32(<2 x float> %val, i32 %voffset, ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -474,7 +474,7 @@ define void @llvm_amdgcn_raw_buffer_store_v3f32(<3 x float> %val, i32 %voffset, ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -515,7 +515,7 @@ define void @llvm_amdgcn_raw_tbuffer_store_v3f32(<3 x float> %val, i32 %voffset, ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -558,7 +558,7 @@ define void @llvm_amdgcn_raw_buffer_store_v4f32(<4 x float> %val, i32 %voffset, ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; 
GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -601,7 +601,7 @@ define void @llvm_amdgcn_raw_tbuffer_store_v4f32(<4 x float> %val, i32 %voffset, ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -636,7 +636,7 @@ define float @llvm_amdgcn_raw_ptr_buffer_load_f32(i32 %voffset, i32 %soffset) { ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -670,7 +670,7 @@ define float 
@llvm_amdgcn_raw_ptr_tbuffer_load_f32(i32 %voffset, i32 %soffset) { ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -704,7 +704,7 @@ define <2 x float> @llvm_amdgcn_raw_ptr_buffer_load_v2f32(i32 %voffset, i32 %sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -741,7 +741,7 @@ define <2 x float> @llvm_amdgcn_raw_ptr_tbuffer_load_v2f32(i32 %voffset, i32 %so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec 
= S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -778,7 +778,7 @@ define <3 x float> @llvm_amdgcn_raw_ptr_buffer_load_v3f32(i32 %voffset, i32 %sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -817,7 +817,7 @@ define <3 x float> @llvm_amdgcn_raw_ptr_tbuffer_load_v3f32(i32 %voffset, i32 %so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -856,7 +856,7 @@ define <4 x float> @llvm_amdgcn_raw_ptr_buffer_load_v4f32(i32 %voffset, i32 %sof ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -897,7 +897,7 @@ define <4 x float> @llvm_amdgcn_raw_ptr_tbuffer_load_v4f32(i32 %voffset, i32 %so ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -939,7 +939,7 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_f32(float %val, i32 %voffset, i32 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -973,7 +973,7 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_f32(float %val, i32 %voffset, i32 ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1012,7 +1012,7 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v2f32(<2 x float> %val, i32 %voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1051,7 +1051,7 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v2f32(<2 x float> %val, i32 %voff ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1092,7 +1092,7 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v3f32(<3 x 
float> %val, i32 %voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1133,7 +1133,7 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v3f32(<3 x float> %val, i32 %voff ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1176,7 +1176,7 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v4f32(<4 x float> %val, i32 %voffs ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed 
[[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} @@ -1219,7 +1219,7 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v4f32(<4 x float> %val, i32 %voff ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; GFX908-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY]], implicit $exec ; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/licm-valu.mir b/llvm/test/CodeGen/AMDGPU/licm-valu.mir index 6a28eee19d503..0020e89580a14 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-valu.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-valu.mir @@ -112,7 +112,7 @@ body: | ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[DEF]], implicit $exec + ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[DEF]], implicit $exec ; GCN-NEXT: $exec = S_OR_B64 $exec, 1, implicit-def $scc ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 @@ -124,7 +124,7 @@ body: | S_BRANCH %bb.1 bb.1: - %1:sgpr_32 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec $exec = S_OR_B64 $exec, 1, implicit-def $scc S_CBRANCH_EXECNZ %bb.1, implicit $exec S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/licm-wwm.mir b/llvm/test/CodeGen/AMDGPU/licm-wwm.mir index fc20674971a71..c3f83d7031621 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-wwm.mir +++ 
b/llvm/test/CodeGen/AMDGPU/licm-wwm.mir @@ -21,7 +21,7 @@ body: | ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[ENTER_STRICT_WWM1:%[0-9]+]]:sreg_32 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec + ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec ; GCN-NEXT: $exec_lo = EXIT_STRICT_WWM [[ENTER_STRICT_WWM1]] ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[V_READFIRSTLANE_B32_]] ; GCN-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY]], implicit-def $scc @@ -35,8 +35,8 @@ body: | bb.1: %0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - %1:sreg_32 = V_READFIRSTLANE_B32 killed %0:vgpr_32, implicit $exec - early-clobber %2:sreg_32 = STRICT_WWM killed %1:sreg_32, implicit $exec + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 killed %0:vgpr_32, implicit $exec + early-clobber %2:sreg_32 = STRICT_WWM killed %1:sreg_32_xm0, implicit $exec $exec_lo = S_OR_B32 $exec_lo, %2, implicit-def $scc S_CBRANCH_EXECNZ %bb.1, implicit $exec S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir index 98b1b69101e51..eaf669da83ead 100644 --- a/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir @@ -187,8 +187,8 @@ body: | bb.3: successors: %bb.5(0x80000000) - %20:sreg_32 = V_READFIRSTLANE_B32 %10.sub0, implicit $exec - %21:sreg_32 = V_READFIRSTLANE_B32 %10.sub4, implicit $exec + %20:sreg_32_xm0 = V_READFIRSTLANE_B32 %10.sub0, implicit $exec + %21:sreg_32_xm0 = V_READFIRSTLANE_B32 %10.sub4, implicit $exec S_CMP_EQ_U32 %21, %20, implicit-def $scc %22:sreg_32 = S_CSELECT_B32 1, 0, implicit killed $scc %16:vgpr_32 = COPY %22 diff 
--git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index a1947f002dea3..a5607c7a23f33 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -144,12 +144,13 @@ ; GCN-O0-NEXT: Insert required mode register values ; GCN-O0-NEXT: SI Final Branch Preparation ; GCN-O0-NEXT: Post RA hazard recognizer +; GCN-O0-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O0-NEXT: Branch relaxation pass -; GCN-O0-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O0-NEXT: Register Usage Information Collector Pass ; GCN-O0-NEXT: Remove Loads Into Fake Uses ; GCN-O0-NEXT: Live DEBUG_VALUE analysis ; GCN-O0-NEXT: Machine Sanitizer Binary Metadata +; GCN-O0-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O0-NEXT: Machine Optimization Remark Emitter ; GCN-O0-NEXT: Stack Frame Layout Analysis @@ -232,11 +233,9 @@ ; GCN-O1-NEXT: Instrument function entry/exit with calls to e.g. 
mcount() (post inlining) ; GCN-O1-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O1-NEXT: Expand reduction intrinsics -; GCN-O1-NEXT: CallGraph Construction -; GCN-O1-NEXT: Call Graph SCC Pass Manager -; GCN-O1-NEXT: AMDGPU Annotate Kernel Features -; GCN-O1-NEXT: FunctionPass Manager -; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments +; GCN-O1-NEXT: FunctionPass Manager +; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O1-NEXT: CallGraph Construction ; GCN-O1-NEXT: Call Graph SCC Pass Manager @@ -427,25 +426,17 @@ ; GCN-O1-NEXT: SI Final Branch Preparation ; GCN-O1-NEXT: SI peephole optimizations ; GCN-O1-NEXT: Post RA hazard recognizer +; GCN-O1-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O1-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-NEXT: Branch relaxation pass -; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O1-NEXT: Register Usage Information Collector Pass ; GCN-O1-NEXT: Remove Loads Into Fake Uses ; GCN-O1-NEXT: Live DEBUG_VALUE analysis ; GCN-O1-NEXT: Machine Sanitizer Binary Metadata +; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-NEXT: Machine Optimization Remark Emitter ; GCN-O1-NEXT: Stack Frame Layout Analysis -<<<<<<< HEAD -; GCN-O1-NEXT: Function register usage analysis -; GCN-O1-NEXT: CallGraph Construction -; GCN-O1-NEXT: Call Graph SCC Pass Manager -; GCN-O1-NEXT: DummyCGSCCPass -; GCN-O1-NEXT: FunctionPass Manager -; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O1-NEXT: Machine Optimization Remark Emitter -======= ; GCN-O1-NEXT: Function register usage analysis ; GCN-O1-OPTS-NEXT:Profile summary info ; GCN-O1-OPTS-NEXT:External Alias Analysis @@ -530,11 +521,9 @@ ; GCN-O1-OPTS-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O1-OPTS-NEXT: Expand reduction intrinsics ; GCN-O1-OPTS-NEXT: Early CSE -; GCN-O1-OPTS-NEXT: CallGraph 
Construction -; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager -; GCN-O1-OPTS-NEXT: AMDGPU Annotate Kernel Features -; GCN-O1-OPTS-NEXT: FunctionPass Manager -; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments +; GCN-O1-OPTS-NEXT: FunctionPass Manager +; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O1-OPTS-NEXT: CallGraph Construction ; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager @@ -739,27 +728,18 @@ ; GCN-O1-OPTS-NEXT: SI Final Branch Preparation ; GCN-O1-OPTS-NEXT: SI peephole optimizations ; GCN-O1-OPTS-NEXT: Post RA hazard recognizer +; GCN-O1-OPTS-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-OPTS-NEXT: Branch relaxation pass -; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass ; GCN-O1-OPTS-NEXT: Remove Loads Into Fake Uses ; GCN-O1-OPTS-NEXT: Live DEBUG_VALUE analysis ; GCN-O1-OPTS-NEXT: Machine Sanitizer Binary Metadata +; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Stack Frame Layout Analysis -<<<<<<< HEAD -; GCN-O1-OPTS-NEXT: Function register usage analysis -; GCN-O1-OPTS-NEXT: CallGraph Construction -; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager -; GCN-O1-OPTS-NEXT: DummyCGSCCPass -; GCN-O1-OPTS-NEXT: FunctionPass Manager -; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter -======= ; GCN-O1-OPTS-NEXT: Function register usage analysis ->>>>>>> c897c13dde3b ([AMDGPU] Convert AMDGPUResourceUsageAnalysis pass from Module to MF pass (#102913)) ; GCN-O1-OPTS-NEXT: AMDGPU Assembly Printer ; GCN-O1-OPTS-NEXT: Free MachineFunction @@ -856,11 +836,9 @@ ; GCN-O2-NEXT: Scalarize Masked Memory 
Intrinsics ; GCN-O2-NEXT: Expand reduction intrinsics ; GCN-O2-NEXT: Early CSE -; GCN-O2-NEXT: CallGraph Construction -; GCN-O2-NEXT: Call Graph SCC Pass Manager -; GCN-O2-NEXT: AMDGPU Annotate Kernel Features -; GCN-O2-NEXT: FunctionPass Manager -; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments +; GCN-O2-NEXT: FunctionPass Manager +; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O2-NEXT: CallGraph Construction ; GCN-O2-NEXT: Call Graph SCC Pass Manager @@ -1067,27 +1045,18 @@ ; GCN-O2-NEXT: SI Final Branch Preparation ; GCN-O2-NEXT: SI peephole optimizations ; GCN-O2-NEXT: Post RA hazard recognizer +; GCN-O2-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O2-NEXT: AMDGPU Insert Delay ALU ; GCN-O2-NEXT: Branch relaxation pass -; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O2-NEXT: Register Usage Information Collector Pass ; GCN-O2-NEXT: Remove Loads Into Fake Uses ; GCN-O2-NEXT: Live DEBUG_VALUE analysis ; GCN-O2-NEXT: Machine Sanitizer Binary Metadata +; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O2-NEXT: Machine Optimization Remark Emitter ; GCN-O2-NEXT: Stack Frame Layout Analysis -<<<<<<< HEAD -; GCN-O2-NEXT: Function register usage analysis -; GCN-O2-NEXT: CallGraph Construction -; GCN-O2-NEXT: Call Graph SCC Pass Manager -; GCN-O2-NEXT: DummyCGSCCPass -; GCN-O2-NEXT: FunctionPass Manager -; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O2-NEXT: Machine Optimization Remark Emitter -======= ; GCN-O2-NEXT: Function register usage analysis ->>>>>>> c897c13dde3b ([AMDGPU] Convert AMDGPUResourceUsageAnalysis pass from Module to MF pass (#102913)) ; GCN-O2-NEXT: AMDGPU Assembly Printer ; GCN-O2-NEXT: Free MachineFunction @@ -1197,11 +1166,9 @@ ; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Global 
Value Numbering -; GCN-O3-NEXT: CallGraph Construction -; GCN-O3-NEXT: Call Graph SCC Pass Manager -; GCN-O3-NEXT: AMDGPU Annotate Kernel Features -; GCN-O3-NEXT: FunctionPass Manager -; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments +; GCN-O3-NEXT: FunctionPass Manager +; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O3-NEXT: CallGraph Construction ; GCN-O3-NEXT: Call Graph SCC Pass Manager @@ -1408,27 +1375,18 @@ ; GCN-O3-NEXT: SI Final Branch Preparation ; GCN-O3-NEXT: SI peephole optimizations ; GCN-O3-NEXT: Post RA hazard recognizer +; GCN-O3-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O3-NEXT: AMDGPU Insert Delay ALU ; GCN-O3-NEXT: Branch relaxation pass -; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O3-NEXT: Register Usage Information Collector Pass ; GCN-O3-NEXT: Remove Loads Into Fake Uses ; GCN-O3-NEXT: Live DEBUG_VALUE analysis ; GCN-O3-NEXT: Machine Sanitizer Binary Metadata +; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O3-NEXT: Machine Optimization Remark Emitter ; GCN-O3-NEXT: Stack Frame Layout Analysis -<<<<<<< HEAD -; GCN-O3-NEXT: Function register usage analysis -; GCN-O3-NEXT: CallGraph Construction -; GCN-O3-NEXT: Call Graph SCC Pass Manager -; GCN-O3-NEXT: DummyCGSCCPass -; GCN-O3-NEXT: FunctionPass Manager -; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O3-NEXT: Machine Optimization Remark Emitter -======= ; GCN-O3-NEXT: Function register usage analysis ->>>>>>> c897c13dde3b ([AMDGPU] Convert AMDGPUResourceUsageAnalysis pass from Module to MF pass (#102913)) ; GCN-O3-NEXT: AMDGPU Assembly Printer ; GCN-O3-NEXT: Free MachineFunction diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll index f86c9365d0b79..6fbd5ff80b5cd 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll @@ -64,7 +64,6 @@ define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) @@ -84,7 +83,6 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) @@ -102,7 +100,6 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inre ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) @@ -172,7 +169,6 @@ define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsr ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) @@ -192,7 +188,6 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %r ; GFX12-NEXT: s_mov_b32 s4, 4 
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) @@ -210,7 +205,6 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> i ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.lds.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.lds.err.ll new file mode 100644 index 0000000000000..7679db8d113ea --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.lds.err.ll @@ -0,0 +1,37 @@ +; RUN: split-file %s %t +; +; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 %t/struct.ll 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 %t/struct.ll 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 %t/struct.ptr.ll 2>&1 | FileCheck --ignore-case --check-prefix=LEGALIZER-FAIL %s +; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 %t/struct.ptr.ll 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 %t/raw.ll 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 %t/raw.ll 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 %t/raw.ptr.ll 2>&1 | FileCheck --ignore-case --check-prefix=LEGALIZER-FAIL %s +; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 %t/raw.ptr.ll 2>&1 | 
FileCheck --ignore-case %s +; +; CHECK: LLVM ERROR: Cannot select +; LEGALIZER-FAIL: Do not know how to expand this operator's operand! + +;--- struct.ll +define amdgpu_ps void @buffer_load_lds(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) { + call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +;--- struct.ptr.ll +define amdgpu_ps void @buffer_load_lds(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0) + ret void +} + +;--- raw.ll +define amdgpu_ps void @buffer_load_lds(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) { + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0) + ret void +} + +;--- raw.ptr.ll +define amdgpu_ps void @buffer_load_lds(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll new file mode 100644 index 0000000000000..ff65d5d96cb2c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s + +declare {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64, float, i8, <3 x float>, <3 x float>, i32, <4 x i32>) + +define amdgpu_ps <10 x float> @image_bvh8_intersect_ray(i64 %node_ptr, float %ray_extent, float 
%ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, i32 %offset, <4 x i32> inreg %tdescr, ptr addrspace(1) %origin, ptr addrspace(1) %dir) { +; GFX12-SDAG-LABEL: image_bvh8_intersect_ray: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: v_dual_mov_b32 v21, v8 :: v_dual_mov_b32 v20, v7 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v19, v6 :: v_dual_mov_b32 v18, v5 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v16, v3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-SDAG-NEXT: image_bvh8_intersect_ray v[0:9], [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3] +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b96 v[10:11], v[16:18], off +; GFX12-SDAG-NEXT: global_store_b96 v[12:13], v[19:21], off +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: image_bvh8_intersect_ray: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: v_dual_mov_b32 v14, v3 :: v_dual_mov_b32 v15, v4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v16, v5 :: v_dual_mov_b32 v17, v6 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v18, v7 :: v_dual_mov_b32 v19, v8 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-GISEL-NEXT: image_bvh8_intersect_ray v[0:9], [v[0:1], v[2:3], v[14:16], v[17:19], v9], s[0:3] +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b96 v[10:11], v[14:16], off +; GFX12-GISEL-NEXT: global_store_b96 v[12:13], v[17:19], off +; GFX12-GISEL-NEXT: ; return to shader part epilog +main_body: + %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %v = call {<10 x 
i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, i32 %offset, <4 x i32> %tdescr) + %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0 + %r = bitcast <10 x i32> %a to <10 x float> + %o = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1 + store <3 x float> %o, ptr addrspace(1) %origin + %d = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2 + store <3 x float> %d, ptr addrspace(1) %dir + ret <10 x float> %r +} + +define amdgpu_ps <10 x float> @image_bvh8_intersect_ray_1(i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, i32 %offset, <4 x i32> inreg %tdescr, ptr addrspace(1) %origin, ptr addrspace(1) %dir) { +; GFX12-SDAG-LABEL: image_bvh8_intersect_ray_1: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: v_dual_mov_b32 v21, v8 :: v_dual_mov_b32 v20, v7 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v19, v6 :: v_dual_mov_b32 v18, v5 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v16, v3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX12-SDAG-NEXT: image_bvh8_intersect_ray v[0:9], [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3] +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b96 v[10:11], v[16:18], off +; GFX12-SDAG-NEXT: global_store_b96 v[12:13], v[19:21], off +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: image_bvh8_intersect_ray_1: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: v_dual_mov_b32 v14, v3 :: v_dual_mov_b32 v15, v4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v16, v5 :: v_dual_mov_b32 v17, v6 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v18, v7 :: v_dual_mov_b32 v19, v8 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 1 +; GFX12-GISEL-NEXT: image_bvh8_intersect_ray v[0:9], [v[0:1], v[2:3], v[14:16], v[17:19], v9], s[0:3] +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-GISEL-NEXT: global_store_b96 v[10:11], v[14:16], off +; GFX12-GISEL-NEXT: global_store_b96 v[12:13], v[17:19], off +; GFX12-GISEL-NEXT: ; return to shader part epilog +main_body: + %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 1, <3 x float> %ray_origin, <3 x float> %ray_dir, i32 %offset, <4 x i32> %tdescr) + %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0 + %r = bitcast <10 x i32> %a to <10 x float> + %o = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1 + store <3 x float> %o, ptr addrspace(1) %origin + %d = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2 + store <3 x float> %d, ptr addrspace(1) %dir + ret <10 x float> %r +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll new file mode 100644 index 0000000000000..e504eb7a5a124 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --global-isel=0 -mtriple=amdgcn -mcpu=tahiti %s -o - | FileCheck %s +; RUN: llc --global-isel=1 -mtriple=amdgcn -mcpu=tahiti %s -o - | FileCheck %s +; RUN: llc --global-isel=0 -mtriple=amdgcn -mcpu=tonga %s -o - | FileCheck %s +; RUN: llc --global-isel=1 -mtriple=amdgcn -mcpu=tonga %s -o - | FileCheck %s +; RUN: llc --global-isel=0 -mtriple=amdgcn -mcpu=gfx90a %s -o - | FileCheck %s +; RUN: 
llc --global-isel=1 -mtriple=amdgcn -mcpu=gfx90a %s -o - | FileCheck %s + +declare float @llvm.amdgcn.cvt.off.f32.i4(i32) + +define amdgpu_cs float @cvt_var(i32 %a) { +; CHECK-LABEL: cvt_var: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cvt_off_f32_i4_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 %a) + ret float %ret +} + +define amdgpu_cs float @cvt_imm() { +; CHECK-LABEL: cvt_imm: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cvt_off_f32_i4_e32 v0, 4 +; CHECK-NEXT: ; return to shader part epilog + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 4) + ret float %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll index a0ba97d3b639c..71c78372e3976 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll @@ -168,7 +168,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_lo(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_lo: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,0] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false) @@ -179,7 +179,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_lo(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_lo: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,0] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 
false) @@ -213,7 +213,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_hi(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_hi: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,1] +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -225,7 +225,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_hi(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_hi: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,1] +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -259,7 +259,7 @@ define float @test_cvt_scalef32_f32_fp8_byte1(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 1) ret float %ret @@ -269,7 +269,7 @@ define float @test_cvt_scalef32_f32_fp8_byte2(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 2) ret float %ret @@ -300,7 +300,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_lo(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_lo: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,0] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false) @@ -311,7 +311,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_lo(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_lo: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,0] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false) @@ -345,7 +345,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_hi(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_hi: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,1] +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -357,7 +357,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_hi(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_hi: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,1] +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -391,7 +391,7 @@ define float @test_cvt_scalef32_f32_bf8_byte1(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 
op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 1) ret float %ret @@ -401,7 +401,7 @@ define float @test_cvt_scalef32_f32_bf8_byte2(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 2) ret float %ret @@ -601,6 +601,34 @@ define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_word1(<2 x i16> %old, <2 x half> ret <2 x i16> %ret } +define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_imm1(<2 x i16> %old, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_imm1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, 4.0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> %old, <2 x half> , float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_imm2(<2 x i16> %old, float %scale) { +; GFX950-SDAG-LABEL: test_cvt_scalef32_pk_fp8_f16_imm2: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x40004400 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, s0, v1 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scalef32_pk_fp8_f16_imm2: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0x40004400 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, v2, v1 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> %old, <2 x 
half> , float %scale, i1 false) + ret <2 x i16> %ret +} + define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_word0(<2 x i16> %old, <2 x bfloat> %src, float %scale) { ; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_word0: ; GCN: ; %bb.0: @@ -621,6 +649,27 @@ define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_word1(<2 x i16> %old, <2 x bfloa ret <2 x i16> %ret } +define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_imm1(<2 x i16> %old, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_imm1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp8_bf16 v0, 4.0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> %old, <2 x bfloat> , float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_imm2(<2 x i16> %old, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_imm2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, 0x40004080 +; GCN-NEXT: v_cvt_scalef32_pk_fp8_bf16 v0, s0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> %old, <2 x bfloat> , float %scale, i1 false) + ret <2 x i16> %ret +} + define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_word0(<2 x i16> %old, <2 x half> %src, float %scale) { ; GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_word0: ; GCN: ; %bb.0: @@ -641,6 +690,34 @@ define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_word1(<2 x i16> %old, <2 x half> ret <2 x i16> %ret } +define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_imm1(<2 x i16> %old, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_imm1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, 4.0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> %old, <2 x half> , float %scale, i1 false) + ret <2 x i16> %ret +} + +define 
<2 x i16> @test_cvt_scalef32_pk_bf8_f16_imm2(<2 x i16> %old, float %scale) { +; GFX950-SDAG-LABEL: test_cvt_scalef32_pk_bf8_f16_imm2: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x40004400 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, s0, v1 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scalef32_pk_bf8_f16_imm2: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0x40004400 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, v2, v1 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> %old, <2 x half> , float %scale, i1 false) + ret <2 x i16> %ret +} + define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_word0(<2 x i16> %old, <2 x bfloat> %src, float %scale) { ; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_word0: ; GCN: ; %bb.0: @@ -661,6 +738,27 @@ define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_word1(<2 x i16> %old, <2 x bfloa ret <2 x i16> %ret } +define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_imm1(<2 x i16> %old, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_imm1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf8_bf16 v0, 4.0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> %old, <2 x bfloat> , float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_imm2(<2 x i16> %old, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_imm2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, 0x40004080 +; GCN-NEXT: v_cvt_scalef32_pk_bf8_bf16 v0, s0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> %old, <2 x bfloat> , float %scale, i1 false) + 
ret <2 x i16> %ret +} + define <2 x float> @test_cvt_scale_f32_fp4_byte0(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_f32_fp4_byte0: ; GCN: ; %bb.0: @@ -675,7 +773,7 @@ define <2 x float> @test_cvt_scale_f32_fp4_byte1(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_f32_fp4_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 1) ret <2 x float> %ret @@ -685,7 +783,7 @@ define <2 x float> @test_cvt_scale_f32_fp4_byte2(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_f32_fp4_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 2) ret <2 x float> %ret @@ -797,7 +895,7 @@ define <2 x half> @test_cvt_scale_f16_fp4_byte1(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_f16_fp4_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 1) ret <2 x half> %ret @@ -807,7 +905,7 @@ define <2 x half> @test_cvt_scale_f16_fp4_byte2(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_f16_fp4_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] 
%ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 2) ret <2 x half> %ret @@ -837,7 +935,7 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte1(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_bf16_fp4_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 1) ret <2 x bfloat> %ret @@ -847,7 +945,7 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte2(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_bf16_fp4_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 2) ret <2 x bfloat> %ret @@ -1236,6 +1334,37 @@ define i32 @test_cvt_scalef32_fp4_f16_byte3(<2 x half> %src0, float %scale, i32 ret i32 %ret } +define i32 @test_cvt_scalef32_fp4_f16_imm1(float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_f16_imm1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, 4.0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> , float %scale, i32 0) + ret i32 %ret +} + +define i32 @test_cvt_scalef32_fp4_f16_imm2(float %scale, i32 %old) { +; GFX950-SDAG-LABEL: test_cvt_scalef32_fp4_f16_imm2: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x40004400 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 +; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v0, v1 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scalef32_fp4_f16_imm2: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0x40004400 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v1 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> , float %scale, i32 0) + ret i32 %ret +} + define i32 @test_cvt_scalef32_fp4_bf16_byte0(<2 x bfloat> %src0, float %scale, i32 %old) { ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte0: ; GCN: ; %bb.0: @@ -1282,3 +1411,1185 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte3(<2 x bfloat> %src0, float %scale, i %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 3) ret i32 %ret } + +define i32 @test_cvt_scalef32_fp4_bf16_imm1(float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_bf16_imm1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, 4.0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> , float %scale, i32 0) + ret i32 %ret +} + +define i32 @test_cvt_scalef32_fp4_bf16_imm2(float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_bf16_imm2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, 0x40004080 +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> , float %scale, i32 0) + ret i32 %ret +} + +define amdgpu_ps void @test_scalef32_pk32_fp6_f32_vv_inreg_src(<16 x float> inreg %src, float %scale, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: 
test_scalef32_pk32_fp6_f32_vv_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v2 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v1 +; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[2:17], v[2:17], v0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f32_vv_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v1 +; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[2:17], v[2:17], v0 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src, <16 x float> %src, float %scale) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_fp6_f32_sl_inreg_src(<16 x float> inreg %src, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: 
test_scalef32_pk32_fp6_f32_sl_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[2:7], v[2:17], v[2:17], s16 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f32_sl_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[2:7], v[2:17], v[2:17], v18 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src, <16 x float> %src, float 100.0) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_bf6_f32_vv_inreg_src(<16 x float> inreg %src, float %scale, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_vv_inreg_src: +; GFX950-SDAG: ; %bb.0: +; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v2 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v1 +; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[2:17], v[2:17], v0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f32_vv_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v1 +; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[2:17], v[2:17], v0 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src, <16 x float> %src, float %scale) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_bf6_f32_sl_inreg_src(<16 x float> inreg inreg %src, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_sl_inreg_src: +; GFX950-SDAG: ; 
%bb.0: +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[2:7], v[2:17], v[2:17], s16 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f32_sl_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[2:7], v[2:17], v[2:17], v18 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src, <16 x float> %src, float 100.0) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte0_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte0_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: 
v_cvt_scalef32_f16_fp8 v1, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte3_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte3_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte0_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: 
test_cvt_scalef32_f16_fp8_byte0_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte3_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte3_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,1,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> 
%old, i32 %src, float %scale, i32 3, i1 true) + ret <2 x half> %ret +} + +define float @test_cvt_scalef32_f32_fp8_byte0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 0) + ret float %ret +} + +define float @test_cvt_scalef32_f32_fp8_byte1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 1) + ret float %ret +} + +define float @test_cvt_scalef32_f32_fp8_byte2_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 2) + ret float %ret +} + +define float @test_cvt_scalef32_f32_fp8_byte3_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[1,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 3) + ret float %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte0_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte0_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte3_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte3_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte0_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: 
test_cvt_scalef32_f16_bf8_byte0_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,1,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte3_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte3_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,1,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> 
%old, i32 %src, float %scale, i32 3, i1 true) + ret <2 x half> %ret +} + +define float @test_cvt_scalef32_f32_bf8_byte0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 0) + ret float %ret +} + +define float @test_cvt_scalef32_f32_bf8_byte1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 1) + ret float %ret +} + +define float @test_cvt_scalef32_f32_bf8_byte2_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 2) + ret float %ret +} + +define float @test_cvt_scalef32_f32_bf8_byte3_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[1,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 3) + ret float %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_f32_word0_inreg_src(<2 x i16> inreg %old, float %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_cvt_scalef32_pk_fp8_f32 v3, v0, v1, v2 +; GCN-NEXT: v_mov_b32_e32 v0, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_f32_word1_inreg_src(<2 x i16> inreg %old, float %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_cvt_scalef32_pk_fp8_f32 v3, v0, v1, v2 op_sel:[0,0,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 true) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_f32_word0_inreg_src(<2 x i16> inreg %old, float %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_cvt_scalef32_pk_bf8_f32 v3, v0, v1, v2 +; GCN-NEXT: v_mov_b32_e32 v0, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_f32_word1_inreg_src(<2 x i16> %old, float inreg %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf8_f32 v0, s0, v1, v2 op_sel:[0,0,0,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> %old, float %src0, float %src1, float 
%scale, i1 true) + ret <2 x i16> %ret +} + +define <2 x float> @test_cvt_scalef32_pk_f32_fp8_word0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_f32_fp8_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp8 v[0:1], s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 %src, float %scale, i1 false) + ret <2 x float> %ret +} + +define <2 x float> @test_cvt_scalef32_pk_f32_fp8_word1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_f32_fp8_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp8 v[0:1], s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 %src, float %scale, i1 true) + ret <2 x float> %ret +} + +define <2 x float> @test_cvt_scalef32_pk_f32_bf8_word0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_f32_bf8_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_bf8 v[0:1], s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 %src, float %scale, i1 false) + ret <2 x float> %ret +} + +define <2 x float> @test_cvt_scalef32_pk_f32_bf8_word1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_f32_bf8_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_bf8 v[0:1], s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 %src, float %scale, i1 true) + ret <2 x float> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_word0_inreg_src(<2 x i16> %old, <2 x half> inreg %src, float %scale) { +; GCN-LABEL: 
test_cvt_scalef32_pk_fp8_f16_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, s0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_word1_inreg_src(<2 x i16> %old, <2 x half> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, s0, v1 op_sel:[0,0,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 true) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_word0_inreg_src(<2 x i16> %old, <2 x bfloat> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp8_bf16 v0, s0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_word1_inreg_src(<2 x i16> %old, <2 x bfloat> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp8_bf16 v0, s0, v1 op_sel:[0,0,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 true) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_word0_inreg_src(<2 x i16> %old, <2 x half> inreg %src, float %scale) { +; GCN-LABEL: 
test_cvt_scalef32_pk_bf8_f16_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, s0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_word1_inreg_src(<2 x i16> %old, <2 x half> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, s0, v1 op_sel:[0,0,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 true) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_word0_inreg_src(<2 x i16> %old, <2 x bfloat> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf8_bf16 v0, s0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_word1_inreg_src(<2 x i16> %old, <2 x bfloat> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf8_bf16 v0, s0, v1 op_sel:[0,0,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 true) + ret <2 x i16> %ret +} + +define <2 x float> @test_cvt_scale_f32_fp4_byte0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f32_fp4_byte0_inreg_src: +; GCN: ; 
%bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 0) + ret <2 x float> %ret +} + +define <2 x float> @test_cvt_scale_f32_fp4_byte1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f32_fp4_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 1) + ret <2 x float> %ret +} + +define <2 x float> @test_cvt_scale_f32_fp4_byte2_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f32_fp4_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[0,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 2) + ret <2 x float> %ret +} + +define <2 x float> @test_cvt_scale_f32_fp4_byte3_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f32_fp4_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[1,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 3) + ret <2 x float> %ret +} + +define i32 @test_cvt_scale_fp4_f32_byte0_inreg_src(i32 %old, float inreg %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scale_fp4_f32_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 
%old, float %src0, float %src1, float %scale, i32 0) + ret i32 %ret +} + +define i32 @test_cvt_scale_fp4_f32_byte1_inreg_src(i32 %old, float inreg %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scale_fp4_f32_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1) + ret i32 %ret +} + +define i32 @test_cvt_scale_fp4_f32_byte2_inreg_src(i32 %old, float inreg %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scale_fp4_f32_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,0,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2) + ret i32 %ret +} + +define i32 @test_cvt_scale_fp4_f32_byte3_inreg_src(i32 %old, float inreg %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scale_fp4_f32_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 3) + ret i32 %ret +} + +define <2 x half> @test_cvt_scale_f16_fp4_byte0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f16_fp4_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 0) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scale_f16_fp4_byte1_inreg_src(i32 inreg %src, 
float %scale) { +; GCN-LABEL: test_cvt_scale_f16_fp4_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 1) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scale_f16_fp4_byte2_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f16_fp4_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 2) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scale_f16_fp4_byte3_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f16_fp4_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[1,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 3) + ret <2 x half> %ret +} + +define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_bf16_fp4_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 0) + ret <2 x bfloat> %ret +} + +define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_bf16_fp4_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + 
%ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 1) + ret <2 x bfloat> %ret +} + +define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte2_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_bf16_fp4_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 2) + ret <2 x bfloat> %ret +} + +define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte3_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_bf16_fp4_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[1,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 3) + ret <2 x bfloat> %ret +} + +define <32 x float> @test_cvt_scale_pk32_f32_fp6_inreg_src(<6 x i32> inreg %src, float %scale) { +; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_fp6_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v39, s17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[34:39], v32 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_fp6_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GFX950-GISEL-NEXT: 
v_mov_b32_e32 v32, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[34:39], v32 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale) + ret <32 x float> %ret +} + +define <32 x float> @test_cvt_scale_pk32_f32_bf6_inreg_src(<6 x i32> inreg %src, float %scale) { +; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_bf6_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v39, s17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[34:39], v32 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_bf6_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[34:39], v32 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> %src, float %scale) + ret <32 x float> %ret +} + +define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv_inreg_src(<6 x i32> inreg %src, float %scale) { +; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v16, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[18:23], v16 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[18:23], v16 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float %scale) + ret <32 x half> %ret +} + +define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl_inreg_src(<6 x i32> inreg inreg %src) { +; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17 +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], s0 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 +; 
GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float 100.0) + ret <32 x half> %ret +} + +define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv_inreg_src(<6 x i32> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v16, v0 +; GCN-NEXT: v_mov_b32_e32 v18, s0 +; GCN-NEXT: v_mov_b32_e32 v19, s1 +; GCN-NEXT: v_mov_b32_e32 v20, s2 +; GCN-NEXT: v_mov_b32_e32 v21, s3 +; GCN-NEXT: v_mov_b32_e32 v22, s16 +; GCN-NEXT: v_mov_b32_e32 v23, s17 +; GCN-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[18:23], v16 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float %scale) + ret <32 x bfloat> %ret +} + +define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl_inreg_src(<6 x i32> inreg inreg %src) { +; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: v_mov_b32_e32 v18, s2 +; GCN-NEXT: v_mov_b32_e32 v19, s3 +; GCN-NEXT: v_mov_b32_e32 v20, s16 +; GCN-NEXT: v_mov_b32_e32 v21, s17 +; GCN-NEXT: s_mov_b32 s0, 0x42c80000 +; GCN-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float 100.0) + ret <32 x bfloat> %ret +} + +define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv_inreg_src(<6 x i32> inreg %src, float %scale) { +; GFX950-SDAG-LABEL: 
test_cvt_scalef32_pk32_f16_bf6_vv_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[18:23], v16 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[18:23], v16 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float %scale) + ret <32 x half> %ret +} + +define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl_inreg_src(<6 x i32> inreg inreg %src) { +; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17 +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], s0 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl_inreg_src: +; GFX950-GISEL: ; %bb.0: 
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float 100.0) + ret <32 x half> %ret +} + +define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv_inreg_src(<6 x i32> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v16, v0 +; GCN-NEXT: v_mov_b32_e32 v18, s0 +; GCN-NEXT: v_mov_b32_e32 v19, s1 +; GCN-NEXT: v_mov_b32_e32 v20, s2 +; GCN-NEXT: v_mov_b32_e32 v21, s3 +; GCN-NEXT: v_mov_b32_e32 v22, s16 +; GCN-NEXT: v_mov_b32_e32 v23, s17 +; GCN-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[18:23], v16 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float %scale) + ret <32 x bfloat> %ret +} + +define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl_inreg_src(<6 x i32> inreg inreg %src) { +; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: v_mov_b32_e32 v18, s2 +; GCN-NEXT: v_mov_b32_e32 v19, s3 +; GCN-NEXT: v_mov_b32_e32 v20, s16 +; GCN-NEXT: v_mov_b32_e32 v21, s17 +; GCN-NEXT: s_mov_b32 s0, 0x42c80000 +; GCN-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float 100.0) + ret <32 
x bfloat> %ret +} + +define <2 x half> @test_cvt_scalef32_pk_f16_fp8_word0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_f16_fp8_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp8 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 %src, float %scale, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_pk_f16_fp8_word1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_f16_fp8_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp8 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 %src, float %scale, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_pk_f16_bf8_word0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_f16_bf8_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f16_bf8 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 %src, float %scale, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_pk_f16_bf8_word1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_f16_bf8_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f16_bf8 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 %src, float %scale, i1 true) + ret <2 x half> %ret +} + +define <2 x bfloat> @test_cvt_scalef32_pk_bf16_fp8_word0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf16_fp8_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp8 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 %src, float %scale, i1 false) + ret <2 x bfloat> %ret +} + +define <2 x bfloat> @test_cvt_scalef32_pk_bf16_fp8_word1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf16_fp8_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp8 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 %src, float %scale, i1 true) + ret <2 x bfloat> %ret +} + +define <2 x bfloat> @test_cvt_scalef32_pk_bf16_bf8_word0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf16_bf8_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_bf8 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 %src, float %scale, i1 false) + ret <2 x bfloat> %ret +} + +define <2 x bfloat> @test_cvt_scalef32_pk_bf16_bf8_word1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf16_bf8_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_bf8 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 %src, float %scale, i1 true) + ret <2 x bfloat> %ret +} + +define i32 @test_cvt_scalef32_fp4_f16_byte0_inreg_src(<2 x half> inreg %src0, float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail 
call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 0) + ret i32 %ret +} + +define i32 @test_cvt_scalef32_fp4_f16_byte1_inreg_src(<2 x half> inreg %src0, float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,0] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 1) + ret i32 %ret +} + +define i32 @test_cvt_scalef32_fp4_f16_byte2_inreg_src(<2 x half> inreg %src0, float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 2) + ret i32 %ret +} + +define i32 @test_cvt_scalef32_fp4_f16_byte3_inreg_src(<2 x half> inreg %src0, float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 3) + ret i32 %ret +} + +define i32 @test_cvt_scalef32_fp4_bf16_byte0_inreg_src(<2 x bfloat> inreg %src0, float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: 
s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 0) + ret i32 %ret +} + +define i32 @test_cvt_scalef32_fp4_bf16_byte1_inreg_src(<2 x bfloat> inreg %src0, float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,0] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 1) + ret i32 %ret +} + +define i32 @test_cvt_scalef32_fp4_bf16_byte2_inreg_src(<2 x bfloat> inreg %src0, float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 2) + ret i32 %ret +} + +define i32 @test_cvt_scalef32_fp4_bf16_byte3_inreg_src(<2 x bfloat> inreg %src0, float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 3) + ret i32 %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll index 517c87193598d..4e5b85344197a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll @@ -294,3 +294,404 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_f16_sl(<32 x half> inreg %src, ptr store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 ret void } + +define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv_inreg_src(<32 x bfloat> inreg %src, float %scale, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_vv_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v1 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], v0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_vv_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8 +; 
GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v1 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], v0 +; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl_inreg_src(<32 x bfloat> inreg %src, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_sl_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], s0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: 
test_scalef32_pk32_bf6_bf16_sl_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-GISEL-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], s0 +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float 100.0) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_bf6_f16_vv_inreg_src(<32 x half> inreg %src, float %scale, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f16_vv_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, 
s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v1 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], v0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f16_vv_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v1 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], v0 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float %scale) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_bf6_f16_sl_inreg_src(<32 x half> inreg %src, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f16_sl_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], s0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f16_sl_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], v24 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float 100.0) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv_inreg_src(<32 x bfloat> inreg %src, float %scale, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_bf16_vv_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 
s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v1 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], v0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_vv_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v1 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], v0 +; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off +; 
GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float %scale) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl_inreg_src(<32 x bfloat> inreg %src, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_bf16_sl_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], s0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_sl_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11 +; 
GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-GISEL-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], s0 +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float 100.0) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_fp6_f16_vv_inreg_src(<32 x half> inreg %src, float %scale, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f16_vv_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v1 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], v0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f16_vv_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 
v25, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v1 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], v0 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float %scale) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_fp6_f16_sl_inreg_src(<32 x half> inreg %src, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f16_sl_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], s0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: 
global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f16_sl_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], v24 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float 100.0) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll index 1a42145ab1d81..2776e24379b9d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll @@ -1,11 +1,11 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-SDAG %s -; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-SDAG,GCN-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL,GCN-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-SDAG,GCN-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL,GCN-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,CIPLUS-SDAG,GCN-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,CIPLUS-GISEL,GCN-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s +; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s ; GCN-LABEL: {{^}}ds_append_lds: ; GCN: s_load_dword [[PTR:s[0-9]+]] @@ -35,8 +35,7 @@ define amdgpu_kernel void @ds_append_lds_max_offset(ptr addrspace(3) %lds, ptr a ; GCN-LABEL: {{^}}ds_append_no_fold_offset_si: ; GCN: s_load_dword [[PTR:s[0-9]+]] -; SI: s_add_i32 [[PTR]], [[PTR]], 16 -; SI: s_mov_b32 m0, [[PTR]] +; SI: s_add_i32 m0, [[PTR]], 16 ; SI: ds_append [[RESULT:v[0-9]+]]{{$}} ; 
CIPLUS: s_mov_b32 m0, [[PTR]] @@ -55,12 +54,8 @@ define amdgpu_kernel void @ds_append_no_fold_offset_si(ptr addrspace(4) %lds.ptr ; GCN-LABEL: {{^}}ds_append_lds_over_max_offset: ; GCN: s_load_dword [[PTR:s[0-9]+]] -; SI-SDAG: s_bitset1_b32 [[PTR]], 16 -; CIPLUS-SDAG: s_add_i32 [[PTR]], [[PTR]], 0x10000 -; GCN-SDAG: s_mov_b32 m0, [[PTR]] - -; SI-GISEL: s_bitset1_b32 m0, 16 -; CIPLUS-GISEL: s_add_u32 m0, [[PTR]], 0x10000 +; SI: s_or_b32 m0, [[PTR]], 0x10000 +; CIPLUSi|u: s_add_{{i|u}}32 m0, [[PTR]], 0x10000 ; GCN: ds_append [[RESULT:v[0-9]+]]{{$}} ; GCN-NOT: buffer_wbinvl1 @@ -73,10 +68,8 @@ define amdgpu_kernel void @ds_append_lds_over_max_offset(ptr addrspace(3) %lds, } ; GCN-LABEL: {{^}}ds_append_lds_vgpr_addr: -; GCN-SDAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; GCN-SDAG: s_mov_b32 m0, [[READLANE]] - -; GCN-GISEL: v_readfirstlane_b32 m0, v0 +; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 +; GCN: s_mov_b32 m0, [[READLANE]] ; GCN: ds_append [[RESULT:v[0-9]+]]{{$}} ; GCN-NOT: buffer_wbinvl1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll new file mode 100644 index 0000000000000..44f5c46954d3b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define amdgpu_gs void @test_ds_bvh_stack_push4_pop1(i32 %addr, i32 %data0, <4 x i32> %data1) { +; CHECK-LABEL: test_ds_bvh_stack_push4_pop1: +; CHECK: ; %bb.0: +; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: export prim v1, off, off, off done +; CHECK-NEXT: s_endpgm + %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, 
i32 0) + %vdst = extractvalue { i32, i32 } %pair, 0 + %newaddr = extractvalue { i32, i32 } %pair, 1 + call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false) + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push4_pop1_1(i32 %addr, i32 %data0, <4 x i32> %data1) { +; CHECK-LABEL: test_ds_bvh_stack_push4_pop1_1: +; CHECK: ; %bb.0: +; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1 +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: export prim v1, off, off, off done +; CHECK-NEXT: s_endpgm + %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1) + %vdst = extractvalue { i32, i32 } %pair, 0 + %newaddr = extractvalue { i32, i32 } %pair, 1 + call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false) + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push8_pop1(i32 %addr, i32 %data0, <8 x i32> %data1) { +; CHECK-LABEL: test_ds_bvh_stack_push8_pop1: +; CHECK: ; %bb.0: +; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: export prim v1, off, off, off done +; CHECK-NEXT: s_endpgm + %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0) + %vdst = extractvalue { i32, i32 } %pair, 0 + %newaddr = extractvalue { i32, i32 } %pair, 1 + call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false) + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push8_pop1_1(i32 %addr, i32 %data0, <8 x i32> %data1) { +; CHECK-LABEL: test_ds_bvh_stack_push8_pop1_1: +; CHECK: ; %bb.0: +; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1 +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: export prim v1, off, off, off done +; CHECK-NEXT: s_endpgm + %pair = call { i32, i32 } 
@llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1) + %vdst = extractvalue { i32, i32 } %pair, 0 + %newaddr = extractvalue { i32, i32 } %pair, 1 + call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false) + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push8_pop2(i32 %addr, i32 %data0, <8 x i32> %data1, ptr addrspace(1) %out1, ptr addrspace(1) %out2) { +; CHECK-LABEL: test_ds_bvh_stack_push8_pop2: +; CHECK: ; %bb.0: +; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v0, v1, v[2:9] +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: export prim v1, off, off, off done +; CHECK-NEXT: s_endpgm + %pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0) + %vdst = extractvalue { i64, i32 } %pair, 0 + %newaddr = extractvalue { i64, i32 } %pair, 1 + %vdst.v2i32 = bitcast i64 %vdst to <2 x i32> + %vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0 + %vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1 + call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 poison, i1 true, i1 false) + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push8_pop2_1(i32 %addr, i32 %data0, <8 x i32> %data1, ptr addrspace(1) %out1, ptr addrspace(1) %out2) { +; CHECK-LABEL: test_ds_bvh_stack_push8_pop2_1: +; CHECK: ; %bb.0: +; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v0, v1, v[2:9] offset:1 +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: export prim v1, off, off, off done +; CHECK-NEXT: s_endpgm + %pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1) + %vdst = extractvalue { i64, i32 } %pair, 0 + %newaddr = extractvalue { i64, i32 } %pair, 1 + %vdst.v2i32 = bitcast i64 %vdst to <2 x i32> + %vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0 + %vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1 + call void 
@llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 poison, i1 true, i1 false) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll index 2694226ace9e7..5795af702f34f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll @@ -1,11 +1,11 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-SDAG %s -; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,GCN-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,GCN-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,GCN-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,GCN-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,GCN-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,GCN-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s +; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s ; GCN-LABEL: {{^}}ds_consume_lds: ; GCN: s_load_dword [[PTR:s[0-9]+]] @@ -35,8 +35,7 @@ define amdgpu_kernel void @ds_consume_lds_max_offset(ptr addrspace(3) %lds, ptr ; GCN-LABEL: {{^}}ds_consume_no_fold_offset_si: ; GCN: s_load_dword [[PTR:s[0-9]+]] -; SI: s_add_i32 [[PTR]], [[PTR]], 16 -; SI: s_mov_b32 m0, [[PTR]] +; SI: s_add_i32 m0, [[PTR]], 16 ; SI: ds_consume [[RESULT:v[0-9]+]]{{$}} ; CIPLUS: s_mov_b32 m0, [[PTR]] @@ -55,11 +54,8 @@ define amdgpu_kernel void @ds_consume_no_fold_offset_si(ptr addrspace(4) %lds.pt ; GCN-LABEL: {{^}}ds_consume_lds_over_max_offset: ; GCN: s_load_dword [[PTR:s[0-9]+]] -; SI: s_bitset1_b32 [[PTR]], 16 -; CIPLUS-SDAG: s_add_i32 [[PTR]], [[PTR]], 0x10000 -; CIPLUS-GISEL: s_add_u32 [[PTR]], [[PTR]], 0x10000 - -; GCN-SDAG: s_mov_b32 m0, [[PTR]] +; SI: s_or_b32 m0, [[PTR]], 0x10000 +; CIPLUS: s_add_{{i|u}}32 m0, [[PTR]], 0x10000 ; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}} ; GCN-NOT: buffer_wbinvl1 ; GCN: {{.*}}store{{.*}} [[RESULT]] @@ -71,9 +67,8 @@ define amdgpu_kernel void @ds_consume_lds_over_max_offset(ptr addrspace(3) %lds, } ; GCN-LABEL: {{^}}ds_consume_lds_vgpr_addr: -; GCN-SDAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; GCN-SDAG: s_mov_b32 m0, [[READLANE]] -; GCN-GISEL: v_readfirstlane_b32 m0, v0 +; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 +; GCN: s_mov_b32 m0, [[READLANE]] ; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}} ; 
GCN-NOT: buffer_wbinvl1 ; GCN: {{.*}}store{{.*}} [[RESULT]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index ad5e9f4eb6a63..4933d6ac29d76 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -4,12 +4,12 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX9 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX9 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX10 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX10 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX10 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX10 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX9 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | 
FileCheck -check-prefixes=GCN,NOLOOP,GFX9 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s ; Make sure the op is emitted bundled with a waitcnt with and without the retry loop, and the bundle is not removed by ExpandPostRAPseudos. ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -verify-machineinstrs < %s | FileCheck -check-prefix=MIR %s @@ -61,11 +61,7 @@ define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 { ; GCN-LABEL: {{^}}gws_barrier_sgpr_offset: ; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]] -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 - +; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]] ; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}} @@ -78,10 +74,7 @@ define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 { ; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1: ; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]] -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]] ; 
NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:1 gds{{$}} @@ -95,10 +88,7 @@ define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.ba ; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]] ; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]] ; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}} @@ -113,10 +103,7 @@ define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 { ; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]] ; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]] ; NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:3 gds{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll index f658ab39f771f..882e6b2fb8c68 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll @@ -4,12 +4,12 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s -; RUN: llc -global-isel=1 
-mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s ; Minimum offset ; GCN-LABEL: {{^}}gws_init_offset0: @@ -55,10 +55,7 @@ define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 { ; GCN-LABEL: {{^}}gws_init_sgpr_offset: ; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]] -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 -; 
NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]] ; NOLOOP: ds_gws_init [[GWS_VAL]] gds{{$}} @@ -71,10 +68,7 @@ define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 { ; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1: ; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]] -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]] ; NOLOOP: ds_gws_init [[GWS_VAL]] offset:1 gds{{$}} @@ -88,10 +82,7 @@ define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) ; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]] ; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16 ; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]] ; NOLOOP: ds_gws_init v0 gds{{$}} @@ -106,10 +97,7 @@ define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 { ; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]] ; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16 ; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]] ; NOLOOP: ds_gws_init v0 offset:3 gds{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll new file mode 100644 index 0000000000000..7e22d60cd710f --- /dev/null 
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s + +declare {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh.dual.intersect.ray(i64, float, i8, <3 x float>, <3 x float>, <2 x i32>, <4 x i32>) + +; ERR: in function image_bvh_dual_intersect_ray{{.*}}intrinsic not supported on subtarget +define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray(i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, <2 x i32> %offsets, <4 x i32> inreg %tdescr, ptr addrspace(1) %origin, ptr addrspace(1) %dir) { +; GFX12-SDAG-LABEL: image_bvh_dual_intersect_ray: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-SDAG-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[17:19], v[20:22], v[9:10]], s[0:3] +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b96 v[11:12], v[17:19], off +; GFX12-SDAG-NEXT: global_store_b96 v[13:14], v[20:22], off +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: image_bvh_dual_intersect_ray: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: v_dual_mov_b32 v15, v3 :: 
v_dual_mov_b32 v16, v4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v17, v5 :: v_dual_mov_b32 v18, v6 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v19, v7 :: v_dual_mov_b32 v20, v8 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-GISEL-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[15:17], v[18:20], v[9:10]], s[0:3] +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b96 v[11:12], v[15:17], off +; GFX12-GISEL-NEXT: global_store_b96 v[13:14], v[18:20], off +; GFX12-GISEL-NEXT: ; return to shader part epilog +main_body: + %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh.dual.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, <2 x i32> %offsets, <4 x i32> %tdescr) + %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0 + %r = bitcast <10 x i32> %a to <10 x float> + %o = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1 + store <3 x float> %o, ptr addrspace(1) %origin + %d = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2 + store <3 x float> %d, ptr addrspace(1) %dir + ret <10 x float> %r +} + +define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray_1(i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, <2 x i32> %offsets, <4 x i32> inreg %tdescr, ptr addrspace(1) %origin, ptr addrspace(1) %dir) { +; GFX12-SDAG-LABEL: image_bvh_dual_intersect_ray_1: +; GFX12-SDAG: ; %bb.0: ; %main_body +; 
GFX12-SDAG-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX12-SDAG-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[17:19], v[20:22], v[9:10]], s[0:3] +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b96 v[11:12], v[17:19], off +; GFX12-SDAG-NEXT: global_store_b96 v[13:14], v[20:22], off +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: image_bvh_dual_intersect_ray_1: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: v_dual_mov_b32 v15, v3 :: v_dual_mov_b32 v16, v4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v17, v5 :: v_dual_mov_b32 v18, v6 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v19, v7 :: v_dual_mov_b32 v20, v8 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 1 +; GFX12-GISEL-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[15:17], v[18:20], v[9:10]], s[0:3] +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b96 v[11:12], v[15:17], off +; GFX12-GISEL-NEXT: global_store_b96 v[13:14], v[18:20], off +; GFX12-GISEL-NEXT: ; return to shader part epilog +main_body: + %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh.dual.intersect.ray(i64 %node_ptr, float %ray_extent, i8 1, <3 x float> %ray_origin, <3 x float> %ray_dir, <2 x i32> %offsets, <4 x i32> %tdescr) + %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 
0 + %r = bitcast <10 x i32> %a to <10 x float> + %o = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1 + store <3 x float> %o, ptr addrspace(1) %origin + %d = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2 + store <3 x float> %d, ptr addrspace(1) %dir + ret <10 x float> %r +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll index 93bc7155cbfa4..5c106cd9a341f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll @@ -132,44 +132,31 @@ define amdgpu_kernel void @id_arg_i32(i32 %row) #0 { ; Divergent row number just causes a readfirstlane for now. define amdgpu_kernel void @id_row_i32() #0 { -; GFX11-SDAG-LABEL: id_row_i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX11-SDAG-NEXT: s_mov_b32 m0, s0 -; GFX11-SDAG-NEXT: exp pos0 v0, off, off, off done row_en -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: id_row_i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x63 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GFX11-GISEL-NEXT: exp pos0 v1, off, off, off done row_en -; GFX11-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: id_row_i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 -; GFX12-SDAG-NEXT: export pos0 v0, off, off, off done row_en -; GFX12-SDAG-NEXT: s_endpgm +; GFX11-LABEL: id_row_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-NEXT: s_mov_b32 m0, s0 +; GFX11-NEXT: exp pos0 v0, off, off, off done row_en +; GFX11-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: id_row_i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x63 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GFX12-GISEL-NEXT: export pos0 v1, off, off, off done row_en -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: id_row_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX12-NEXT: s_mov_b32 m0, s0 +; GFX12-NEXT: export pos0 v0, off, off, off done row_en +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() call void @llvm.amdgcn.exp.row.i32(i32 12, i32 1, i32 99, i32 undef, i32 undef, i32 undef, i1 true, i32 %id) ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-GISEL: {{.*}} +; GFX11-SDAG: {{.*}} +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll new file mode 100644 index 0000000000000..383f6c1288d13 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll @@ -0,0 +1,13 @@ +; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --ignore-case %s +; +; CHECK: LLVM ERROR: Cannot select + +declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux) + +define amdgpu_ps void @global_load_lds_dword(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) { + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll index 8f67375a09cb7..8603d7aaa03dc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll @@ -17,20 +17,13 @@ declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr ;--------------------------------------------------------------------- define amdgpu_ps void @global_load_lds_dwordx3_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) { -; GFX950-SDAG-LABEL: global_load_lds_dwordx3_vaddr: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s0, v2 -; GFX950-SDAG-NEXT: s_mov_b32 
m0, s0 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0 -; GFX950-SDAG-NEXT: s_endpgm -; -; GFX950-GISEL-LABEL: global_load_lds_dwordx3_vaddr: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v2 -; GFX950-GISEL-NEXT: s_nop 4 -; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0 -; GFX950-GISEL-NEXT: s_endpgm +; GFX950-LABEL: global_load_lds_dwordx3_vaddr: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_readfirstlane_b32 s0, v2 +; GFX950-NEXT: s_mov_b32 m0, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0 +; GFX950-NEXT: s_endpgm call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1) ret void } @@ -47,9 +40,10 @@ define amdgpu_ps void @global_load_lds_dwordx3_saddr(ptr addrspace(1) nocapture ; ; GFX950-GISEL-LABEL: global_load_lds_dwordx3_saddr: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX950-GISEL-NEXT: v_readfirstlane_b32 s2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-GISEL-NEXT: s_nop 3 +; GFX950-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v0, s[0:1] offset:32 nt ; GFX950-GISEL-NEXT: s_endpgm call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 32, i32 2) @@ -57,20 +51,13 @@ define amdgpu_ps void @global_load_lds_dwordx3_saddr(ptr addrspace(1) nocapture } define amdgpu_ps void @global_load_lds_dwordx3_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) { -; GFX950-SDAG-LABEL: global_load_lds_dwordx3_saddr_and_vaddr: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX950-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1 -; GFX950-SDAG-NEXT: s_endpgm -; -; GFX950-GISEL-LABEL: 
global_load_lds_dwordx3_saddr_and_vaddr: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GFX950-GISEL-NEXT: s_nop 4 -; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1 -; GFX950-GISEL-NEXT: s_endpgm +; GFX950-LABEL: global_load_lds_dwordx3_saddr_and_vaddr: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_readfirstlane_b32 s2, v0 +; GFX950-NEXT: s_mov_b32 m0, s2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1 +; GFX950-NEXT: s_endpgm %voffset.64 = zext i32 %voffset to i64 %gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64 call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 12, i32 48, i32 16) @@ -82,20 +69,13 @@ define amdgpu_ps void @global_load_lds_dwordx3_saddr_and_vaddr(ptr addrspace(1) ;--------------------------------------------------------------------- define amdgpu_ps void @global_load_lds_dwordx4_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) { -; GFX950-SDAG-LABEL: global_load_lds_dwordx4_vaddr: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s0, v2 -; GFX950-SDAG-NEXT: s_mov_b32 m0, s0 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0 -; GFX950-SDAG-NEXT: s_endpgm -; -; GFX950-GISEL-LABEL: global_load_lds_dwordx4_vaddr: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v2 -; GFX950-GISEL-NEXT: s_nop 4 -; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0 -; GFX950-GISEL-NEXT: s_endpgm +; GFX950-LABEL: global_load_lds_dwordx4_vaddr: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_readfirstlane_b32 s0, v2 +; GFX950-NEXT: s_mov_b32 m0, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0 +; GFX950-NEXT: s_endpgm call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1) ret void } @@ -112,9 +92,10 @@ define amdgpu_ps 
void @global_load_lds_dwordx4_saddr(ptr addrspace(1) nocapture ; ; GFX950-GISEL-LABEL: global_load_lds_dwordx4_saddr: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX950-GISEL-NEXT: v_readfirstlane_b32 s2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-GISEL-NEXT: s_nop 3 +; GFX950-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v0, s[0:1] offset:32 nt ; GFX950-GISEL-NEXT: s_endpgm call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 32, i32 2) @@ -122,24 +103,15 @@ define amdgpu_ps void @global_load_lds_dwordx4_saddr(ptr addrspace(1) nocapture } define amdgpu_ps void @global_load_lds_dwordx4_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) { -; GFX950-SDAG-LABEL: global_load_lds_dwordx4_saddr_and_vaddr: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX950-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1 -; GFX950-SDAG-NEXT: s_endpgm -; -; GFX950-GISEL-LABEL: global_load_lds_dwordx4_saddr_and_vaddr: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GFX950-GISEL-NEXT: s_nop 4 -; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1 -; GFX950-GISEL-NEXT: s_endpgm +; GFX950-LABEL: global_load_lds_dwordx4_saddr_and_vaddr: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_readfirstlane_b32 s2, v0 +; GFX950-NEXT: s_mov_b32 m0, s2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1 +; GFX950-NEXT: s_endpgm %voffset.64 = zext i32 %voffset to i64 %gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64 call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 16, i32 48, i32 16) ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX950: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll index 7362baf6bab95..510c687e9db1a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll @@ -41,8 +41,9 @@ define amdgpu_ps void @global_load_lds_dword_vaddr(ptr addrspace(1) nocapture %g ; ; GFX900-GISEL-LABEL: global_load_lds_dword_vaddr: ; GFX900-GISEL: ; %bb.0: ; %main_body -; GFX900-GISEL-NEXT: v_readfirstlane_b32 m0, v2 -; GFX900-GISEL-NEXT: s_nop 4 +; GFX900-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX900-GISEL-NEXT: s_mov_b32 m0, s0 +; GFX900-GISEL-NEXT: s_nop 0 ; GFX900-GISEL-NEXT: global_load_dword v[0:1], off offset:16 glc lds ; GFX900-GISEL-NEXT: s_endpgm main_body: @@ -88,9 +89,10 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) nocapture in ; ; GFX900-GISEL-LABEL: global_load_lds_dword_saddr: ; GFX900-GISEL: ; %bb.0: ; %main_body -; GFX900-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX900-GISEL-NEXT: v_readfirstlane_b32 s2, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-GISEL-NEXT: s_nop 3 +; GFX900-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX900-GISEL-NEXT: s_nop 0 ; GFX900-GISEL-NEXT: global_load_dword v0, s[0:1] offset:32 slc lds ; GFX900-GISEL-NEXT: s_endpgm main_body: @@ -132,8 +134,9 @@ define amdgpu_ps void @global_load_lds_dword_saddr_and_vaddr(ptr addrspace(1) no ; ; GFX900-GISEL-LABEL: global_load_lds_dword_saddr_and_vaddr: ; GFX900-GISEL: ; %bb.0: ; %main_body -; GFX900-GISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GFX900-GISEL-NEXT: s_nop 4 +; GFX900-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX900-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX900-GISEL-NEXT: s_nop 0 ; GFX900-GISEL-NEXT: global_load_dword v1, s[0:1] offset:48 lds ; GFX900-GISEL-NEXT: s_endpgm main_body: @@ -177,8 +180,9 @@ define amdgpu_ps void @global_load_lds_ushort_vaddr(ptr addrspace(1) nocapture % ; ; 
GFX900-GISEL-LABEL: global_load_lds_ushort_vaddr: ; GFX900-GISEL: ; %bb.0: ; %main_body -; GFX900-GISEL-NEXT: v_readfirstlane_b32 m0, v2 -; GFX900-GISEL-NEXT: s_nop 4 +; GFX900-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX900-GISEL-NEXT: s_mov_b32 m0, s0 +; GFX900-GISEL-NEXT: s_nop 0 ; GFX900-GISEL-NEXT: global_load_ushort v[0:1], off lds ; GFX900-GISEL-NEXT: s_endpgm main_body: @@ -220,8 +224,9 @@ define amdgpu_ps void @global_load_lds_ubyte_vaddr(ptr addrspace(1) nocapture %g ; ; GFX900-GISEL-LABEL: global_load_lds_ubyte_vaddr: ; GFX900-GISEL: ; %bb.0: ; %main_body -; GFX900-GISEL-NEXT: v_readfirstlane_b32 m0, v2 -; GFX900-GISEL-NEXT: s_nop 4 +; GFX900-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX900-GISEL-NEXT: s_mov_b32 m0, s0 +; GFX900-GISEL-NEXT: s_nop 0 ; GFX900-GISEL-NEXT: global_load_ubyte v[0:1], off lds ; GFX900-GISEL-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index 9e6a85dd2810d..aad6e031aa9ed 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -1219,14 +1219,14 @@ body: | %84:vgpr_32 = COPY %3347 %86:vgpr_32 = COPY %3347:vgpr_32 IGLP_OPT 2 - %593:sreg_32 = V_READFIRSTLANE_B32 %11:vgpr_32, implicit $exec - %595:vgpr_32 = V_LSHL_ADD_U32_e64 %593:sreg_32, 4, %3329:vgpr_32, implicit $exec + %593:sreg_32_xm0 = V_READFIRSTLANE_B32 %11:vgpr_32, implicit $exec + %595:vgpr_32 = V_LSHL_ADD_U32_e64 %593:sreg_32_xm0, 4, %3329:vgpr_32, implicit $exec %597:vgpr_32 = nsw V_MUL_LO_U32_e64 %595:vgpr_32, %1.sub6:sgpr_512, implicit $exec %599:vgpr_32 = V_ADD_LSHL_U32_e64 %597:vgpr_32, %16:vgpr_32, 1, implicit $exec %601:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec %602:vgpr_32 = V_ADD_U32_e32 %18:sreg_32, %599:vgpr_32, implicit $exec %603:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %602:vgpr_32, %443:sgpr_128, 0, 
0, 0, 0, implicit $exec - %605:sreg_32 = S_LSHL_B32 %593:sreg_32, 7, implicit-def dead $scc + %605:sreg_32 = S_LSHL_B32 %593:sreg_32_xm0, 7, implicit-def dead $scc %606:vgpr_32 = V_ADD_LSHL_U32_e64 %25:vgpr_32, %605:sreg_32, 1, implicit $exec DS_WRITE_B128_gfx9 %606:vgpr_32, %601:vreg_128_align2, 0, 0, implicit $exec DS_WRITE_B128_gfx9 %606:vgpr_32, %603:vreg_128_align2, 1024, 0, implicit $exec @@ -1732,7 +1732,7 @@ body: | undef %2978.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec %3005.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec %2978.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec - %1442:vgpr_32 = V_ADD_U32_e32 %593:sreg_32, %15:vgpr_32, implicit $exec + %1442:vgpr_32 = V_ADD_U32_e32 %593:sreg_32_xm0, %15:vgpr_32, implicit $exec %1444:vgpr_32 = V_AND_B32_e32 536870911, %1442:vgpr_32, implicit $exec %1446:vgpr_32 = nsw V_MUL_LO_U32_e64 %1444:vgpr_32, %494:sreg_32, implicit $exec %1447:vgpr_32 = V_ADD_LSHL_U32_e64 %47:vgpr_32, %1446:vgpr_32, 1, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 2707c2209e7c9..0887fdf0844b0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -541,11 +541,11 @@ body: | %259:vreg_512_align2 = IMPLICIT_DEF %260:vreg_512_align2 = IMPLICIT_DEF IGLP_OPT 2 - %27:sreg_32 = V_READFIRSTLANE_B32 %2:vgpr_32, implicit $exec - %28:vgpr_32 = V_LSHL_ADD_U32_e64 %27:sreg_32, 4, %29:vgpr_32, implicit $exec + %27:sreg_32_xm0 = V_READFIRSTLANE_B32 %2:vgpr_32, implicit $exec + %28:vgpr_32 = V_LSHL_ADD_U32_e64 %27:sreg_32_xm0, 4, %29:vgpr_32, implicit $exec %30:vreg_64_align2, dead %31:sreg_64 = V_MAD_U64_U32_e64 %3:sreg_32, %28:vgpr_32, 
%4:vreg_64_align2, 0, implicit $exec %32:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %30.sub0:vreg_64_align2, %5:sgpr_128, 0, 0, 0, 0, implicit $exec - %33:sreg_32 = S_LSHL_B32 %27:sreg_32, 7, implicit-def dead $scc + %33:sreg_32 = S_LSHL_B32 %27:sreg_32_xm0, 7, implicit-def dead $scc %34:vgpr_32 = V_ADD_LSHL_U32_e64 %6:vgpr_32, %33:sreg_32, 1, implicit $exec DS_WRITE_B128_gfx9 %34:vgpr_32, %32:vreg_128_align2, 0, 0, implicit $exec %35:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %30.sub0:vreg_64_align2, %5:sgpr_128, 0, 64, 0, 0, implicit $exec @@ -777,7 +777,7 @@ body: | %249:vgpr_32 = V_PERM_B32_e64 %40.sub0:vreg_64_align2, %38.sub0:vreg_64_align2, %15:sreg_32, implicit $exec %250:vgpr_32 = V_PERM_B32_e64 %40.sub1:vreg_64_align2, %38.sub1:vreg_64_align2, %14:sreg_32, implicit $exec %251:vgpr_32 = V_PERM_B32_e64 %40.sub1:vreg_64_align2, %38.sub1:vreg_64_align2, %15:sreg_32, implicit $exec - %252:vgpr_32 = V_ADD_U32_e32 %27:sreg_32, %16:vgpr_32, implicit $exec + %252:vgpr_32 = V_ADD_U32_e32 %27:sreg_32_xm0, %16:vgpr_32, implicit $exec %253:vgpr_32 = V_AND_B32_e32 536870911, %252:vgpr_32, implicit $exec %254:vgpr_32 = nsw V_MUL_LO_U32_e64 %253:vgpr_32, %17:sreg_32, implicit $exec %255:vgpr_32 = V_ADD_LSHL_U32_e64 %18:vgpr_32, %254:vgpr_32, 1, implicit $exec @@ -897,4 +897,3 @@ body: | %29:vgpr_32 = nuw V_ADD_U32_e32 64, %29:vgpr_32, implicit $exec S_ENDPGM 0 ... 
- diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll index 353f4d90cad1f..990a6066adcd3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll @@ -26,7 +26,6 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GISEL12-NEXT: v_add_nc_u32_e32 v11, 32, v12 ; GISEL12-NEXT: s_mov_b32 exec_lo, s5 -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_setpc_b64 s[6:7] ; ; DAGISEL12-LABEL: basic: @@ -50,7 +49,6 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 ; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; DAGISEL12-NEXT: v_add_nc_u32_e32 v11, 32, v12 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5 -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_setpc_b64 s[6:7] ; ; GISEL10-LABEL: basic: @@ -123,8 +121,9 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0 +; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s8 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -132,7 +131,6 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal ; GISEL12-NEXT: ; %bb.2: ; %tail ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GISEL12-NEXT: s_mov_b32 exec_lo, s5 -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_setpc_b64 s[6:7] ; ; DAGISEL12-LABEL: wwm_in_shader: @@ -159,7 +157,6 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr 
inreg %cal ; DAGISEL12-NEXT: ; %bb.2: ; %tail ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5 -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_setpc_b64 s[6:7] ; ; GISEL10-LABEL: wwm_in_shader: @@ -244,8 +241,9 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg % ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v12, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0 +; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s8 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -253,7 +251,6 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg % ; GISEL12-NEXT: ; %bb.2: ; %tail ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GISEL12-NEXT: s_mov_b32 exec_lo, s5 -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_setpc_b64 s[6:7] ; ; DAGISEL12-LABEL: phi_whole_struct: @@ -279,7 +276,6 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg % ; DAGISEL12-NEXT: ; %bb.2: ; %tail ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5 -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_setpc_b64 s[6:7] ; ; GISEL10-LABEL: phi_whole_struct: @@ -367,8 +363,9 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call ; GISEL12-NEXT: s_or_saveexec_b32 s8, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v1, s8 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s9, 0, v0 +; GISEL12-NEXT: s_wait_alu 0xf1ff ; 
GISEL12-NEXT: v_mov_b32_e32 v0, s9 ; GISEL12-NEXT: s_mov_b32 exec_lo, s8 ; GISEL12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v13, v1 @@ -406,7 +403,6 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call ; GISEL12-NEXT: ; %bb.8: ; %tail.end ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GISEL12-NEXT: s_mov_b32 exec_lo, s5 -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_setpc_b64 s[6:7] ; ; DAGISEL12-LABEL: control_flow: @@ -465,7 +461,6 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call ; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5 -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_setpc_b64 s[6:7] ; ; GISEL10-LABEL: control_flow: @@ -619,8 +614,9 @@ define amdgpu_cs_chain void @use_v0_7(<3 x i32> inreg %sgpr, ptr inreg %callee, ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v13, 0x47, v12, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v13 +; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v13, s8 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -632,7 +628,6 @@ define amdgpu_cs_chain void @use_v0_7(<3 x i32> inreg %sgpr, ptr inreg %callee, ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GISEL12-NEXT: s_mov_b32 exec_lo, s5 -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_setpc_b64 s[6:7] ; ; DAGISEL12-LABEL: use_v0_7: @@ -663,7 +658,6 @@ define amdgpu_cs_chain void @use_v0_7(<3 x i32> inreg %sgpr, ptr inreg %callee, ; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5 -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_setpc_b64 
s[6:7] ; ; GISEL10-LABEL: use_v0_7: @@ -786,7 +780,6 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; GISEL12-NEXT: v_dual_mov_b32 v12, v36 :: v_dual_mov_b32 v13, v37 ; GISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39 ; GISEL12-NEXT: s_wait_kmcnt 0x0 -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GISEL12-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1 ; GISEL12-NEXT: v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3 @@ -861,7 +854,6 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; DAGISEL12-NEXT: v_dual_mov_b32 v12, v36 :: v_dual_mov_b32 v13, v37 ; DAGISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39 ; DAGISEL12-NEXT: s_wait_kmcnt 0x0 -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1] ; DAGISEL12-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 ; DAGISEL12-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll index 1b1c89d9f5ad2..e0a5d397bded4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll @@ -26,8 +26,9 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; GISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11] -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0 +; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s12 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GISEL12-NEXT: v_mov_b32_e32 v1, s13 @@ -40,7 +41,6 @@ 
define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL12-NEXT: s_mov_b64 exec, s[4:5] -; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_setpc_b64 s[8:9] ; ; DAGISEL12-LABEL: basic: @@ -71,7 +71,6 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; DAGISEL12-NEXT: ; %bb.2: ; %tail ; DAGISEL12-NEXT: s_or_b64 exec, exec, s[8:9] ; DAGISEL12-NEXT: s_mov_b64 exec, s[6:7] -; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: s_setpc_b64 s[4:5] ; ; GISEL10-LABEL: basic: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index e592a4ac5e8fa..513ffb38fe7f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -261,7 +261,7 @@ main_body: ; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs. 
-define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -352,9 +352,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 4.0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX12-SDAG-NEXT: flat_load_b32 v9, v[0:1] ; GFX12-SDAG-NEXT: flat_load_b32 v10, v[2:3] @@ -391,6 +392,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_b32 v9, v[0:1] ; GFX12-GISEL-NEXT: flat_load_b32 v10, v[2:3] @@ -423,7 +425,7 @@ main_body: ret void } -define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -503,9 +505,10 @@ define amdgpu_kernel void 
@image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX12-SDAG-NEXT: flat_load_b32 v6, v[0:1] ; GFX12-SDAG-NEXT: flat_load_b32 v7, v[2:3] @@ -537,6 +540,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-GISEL-NEXT: flat_load_b32 v6, v[0:1] ; GFX12-GISEL-NEXT: flat_load_b32 v7, v[2:3] @@ -569,7 +573,7 @@ main_body: ret void } -define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_clause 0x1 @@ -730,7 +734,7 @@ main_body: ret void } -define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 8d380516df8b5..452033f332659 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -49,7 +49,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_mov_b32_e32 v9, s17 ; GCN-NEXT: v_mov_b32_e32 v10, s18 ; GCN-NEXT: v_mov_b32_e32 v11, s19 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 4 ; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 @@ -122,7 +122,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN-NEXT: v_mov_b32_e32 v9, s17 ; GCN-NEXT: v_mov_b32_e32 v10, s18 ; GCN-NEXT: v_mov_b32_e32 v11, s19 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 4 ; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 @@ -179,7 +179,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -224,7 +224,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -417,7 +417,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat> ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: global_store_dwordx4 v0, 
a[12:15], s[0:1] offset:48 ; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -459,7 +459,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index d9ee276c3f076..ee41198f0449b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -19,7 +19,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -39,7 +39,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v8, 
a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -88,7 +88,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -135,7 +135,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1) @@ -186,7 +186,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 ; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 4 ; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 @@ -253,7 +253,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 
v[14:15], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 4 ; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 @@ -316,7 +316,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 ; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 4 ; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 @@ -383,7 +383,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 4 ; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 @@ -430,7 +430,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -475,7 +475,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -776,7 +776,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> 
%ar ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -813,7 +813,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -855,7 +855,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -892,7 +892,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -919,7 +919,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], 
v[4:7], a[0:3] -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -939,7 +939,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -971,7 +971,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -992,7 +992,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0) @@ -1022,7 +1022,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -1043,7 +1043,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 ; 
GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 3, i32 2, i32 1) @@ -1097,7 +1097,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 ; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 @@ -1169,7 +1169,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 4 ; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 ; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 @@ -1305,7 +1305,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 4 ; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 
v[22:23], a[20:23], off sc0 sc1 @@ -1352,7 +1352,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1397,7 +1397,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1717,7 +1717,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -1754,7 +1754,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -1801,7 +1801,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; 
SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -1838,7 +1838,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -1865,7 +1865,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1885,7 +1885,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1913,7 +1913,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) @@ -1939,7 +1939,7 @@ define amdgpu_kernel void 
@test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 9a8282231ac15..a53581cab8b01 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -24,7 +24,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -46,9 +46,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -70,9 +70,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 
; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -94,9 +94,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -118,9 +118,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -142,9 +142,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], 
a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -166,9 +166,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -190,9 +190,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -217,7 +217,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -242,7 +242,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], 
a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -267,7 +267,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -292,7 +292,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -317,7 +317,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -342,7 +342,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -367,7 +367,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 ; GCN-NEXT: 
s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -392,7 +392,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -417,7 +417,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -442,7 +442,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -467,7 +467,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -492,7 +492,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 ; GCN-NEXT: s_nop 7 -; 
GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -518,7 +518,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -543,7 +543,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -567,7 +567,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -592,7 +592,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -617,7 +617,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 
+; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -642,7 +642,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -667,7 +667,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -692,7 +692,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -717,7 +717,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -742,7 +742,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 
+; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -767,7 +767,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -791,7 +791,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -815,7 +815,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -839,7 +839,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -863,7 +863,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 
a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -889,7 +889,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -914,7 +914,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -939,7 +939,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -964,7 +964,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -988,7 +988,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], 
v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1012,7 +1012,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1036,7 +1036,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1060,7 +1060,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1084,7 +1084,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1108,7 +1108,7 @@ define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1132,7 +1132,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1156,7 +1156,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1181,7 +1181,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1206,7 +1206,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ 
-1231,7 +1231,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1256,7 +1256,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1280,7 +1280,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1304,7 +1304,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1328,7 +1328,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, 
a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1352,7 +1352,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1376,7 +1376,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1400,7 +1400,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1425,11 +1425,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a1, v17 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: 
v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1447,10 +1448,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a1, v17 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1468,10 +1470,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a1, v17 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1512,7 +1515,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1545,7 +1548,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], 
v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1571,10 +1574,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1596,10 +1600,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1625,10 +1630,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 
op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1650,10 +1656,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1679,10 +1686,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1704,10 +1712,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1725,10 +1734,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: v_accvgpr_write_b32 a1, s1 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_mov_b32_e32 v17, s16 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, s16 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1754,10 +1764,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a1, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s23 +; SDAG-NEXT: v_mov_b32_e32 v9, s24 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1779,10 +1790,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a1, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s23 +; GISEL-NEXT: 
v_mov_b32_e32 v9, s24 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1793,22 +1805,43 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v20, -2 +; SDAG-NEXT: v_mov_b32_e32 v21, 33 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; 
SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 33 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) ret <4 x float> %result } @@ -1818,14 +1851,15 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v20, -2 +; SDAG-NEXT: v_mov_b32_e32 v21, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: 
v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1840,10 +1874,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1858,15 +1893,15 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v20, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v21, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1883,9 +1918,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 ; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1923,11 +1958,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v17, s12 +; SDAG-NEXT: v_mov_b32_e32 v18, s13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v18 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; @@ -1948,12 +1984,13 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s25 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s26 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s29 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, 
i32 1, i32 %scale1) @@ -1966,8 +2003,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: s_movk_i32 s6, 0x41 -; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; SDAG-NEXT: v_mov_b32_e32 v17, -2 +; SDAG-NEXT: v_mov_b32_e32 v18, 0x41 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 @@ -1991,18 +2029,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v18, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 -; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] @@ -2017,17 +2056,224 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 
op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 ret void } +define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; SDAG-NEXT: v_mov_b32_e32 v17, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v18, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: s_nop 1 +; 
SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v18, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[6:7] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, 1.0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: s_endpgm + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216) + store <4 x float> %result, ptr addrspace(1) %ptr, align 16 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; SDAG-NEXT: v_mov_b32_e32 v17, -2 +; SDAG-NEXT: v_mov_b32_e32 v18, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v18, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[6:7] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v16, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: 
v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: s_endpgm + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2) + store <4 x float> %result, ptr addrspace(1) %ptr, align 16 + ret void +} + +define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; SDAG-NEXT: v_mov_b32_e32 v17, 0.15915494 +; SDAG-NEXT: v_mov_b32_e32 v18, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 
v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v18, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[6:7] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v16, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v17, 0.15915494 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: s_endpgm + %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491) + store 
<4 x float> %result, ptr addrspace(1) %ptr, align 16 + ret void +} + ; This should be optimized to avoid the scale define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a: @@ -2040,7 +2286,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2062,7 +2308,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2073,43 +2319,85 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: 
s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v20, 1 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <4 x float> %result } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: -; GCN: ; 
%bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 1 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 1 +; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; 
GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) ret <4 x float> %result } @@ -2129,7 +2417,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2153,7 +2441,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2176,7 +2464,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2199,7 +2487,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_ ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2223,7 
+2511,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2247,7 +2535,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2271,7 +2559,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2295,7 +2583,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2318,7 +2606,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 
; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2341,7 +2629,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4_ ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2364,5 +2652,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 05f8739e7cb89..1e70b3040774f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -40,7 +40,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -85,7 +85,7 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -134,10 +134,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -179,10 +179,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -231,10 +231,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -276,10 +276,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -328,10 +328,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -373,10 +373,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; 
GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -425,10 +425,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -470,10 +470,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -522,10 +522,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 
3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -567,10 +567,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -619,10 +619,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -664,10 +664,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, 
a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -716,10 +716,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -761,10 +761,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -815,7 +815,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -868,7 +868,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; 
SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -913,7 +913,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -963,7 +963,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1015,7 +1015,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1064,7 +1064,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1116,7 +1116,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: 
s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1165,7 +1165,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1215,7 +1215,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1264,7 +1264,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1317,7 +1317,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1362,7 +1362,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; 
GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1412,7 +1412,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1465,7 +1465,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1510,7 +1510,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1561,7 +1561,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1613,7 +1613,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2 ; 
GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1662,7 +1662,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1714,7 +1714,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1763,7 +1763,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1813,7 +1813,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1862,7 +1862,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 ; 
GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1914,7 +1914,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1963,7 +1963,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2015,7 +2015,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2064,7 +2064,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2113,7 +2113,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, 
v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2161,7 +2161,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2210,7 +2210,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2258,7 +2258,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2311,7 +2311,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2360,7 +2360,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 ; 
GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2412,7 +2412,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2461,7 +2461,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2510,7 +2510,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2558,7 +2558,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2607,7 +2607,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], 
a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2655,7 +2655,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2704,7 +2704,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2752,7 +2752,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2801,7 +2801,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2849,7 +2849,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 
a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2899,7 +2899,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2948,7 +2948,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2998,7 +2998,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3047,7 +3047,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3096,7 +3096,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], 
v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3144,7 +3144,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3193,7 +3193,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3241,7 +3241,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3290,7 +3290,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3338,7 +3338,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: 
v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3387,13 +3387,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a12, v28 ; GCN-NEXT: v_accvgpr_write_b32 a13, v29 ; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3436,12 +3437,13 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a12, v28 ; GCN-NEXT: v_accvgpr_write_b32 a13, v29 ; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v31 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3484,12 +3486,13 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a12, v28 ; GCN-NEXT: v_accvgpr_write_b32 a13, v29 ; GCN-NEXT: v_accvgpr_write_b32 a14, v30 
+; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, s0 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v16 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3567,7 +3570,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3637,7 +3640,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3687,11 +3690,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 
v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3737,11 +3741,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3791,11 +3796,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3841,11 +3847,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] ; 
GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3895,11 +3902,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3945,11 +3953,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3991,11 +4000,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a13, s25 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s26 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s27 +; SDAG-NEXT: v_mov_b32_e32 v17, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], 
v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4037,11 +4047,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a13, s25 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s26 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s27 +; GISEL-NEXT: v_mov_b32_e32 v17, s28 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4111,7 +4122,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4177,7 +4188,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4200,48 +4211,95 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 33, -2 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: 
test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, -2 +; SDAG-NEXT: v_mov_b32_e32 v32, 33 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 33 +; GISEL-NEXT: v_mov_b32_e32 v32, -2 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) ret <16 x float> %result } @@ -4251,7 +4309,8 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v31, -2 +; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4269,10 +4328,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, -2 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4296,6 +4355,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: scratch_load_dword a15, off, s32 ; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, -2 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4313,10 +4373,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, -2 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: 
v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4338,13 +4398,13 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ret <16 x float> %result } -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: s_movk_i32 s0, 0x41 -; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v31, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4362,10 +4422,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4384,12 +4444,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; 
GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: scratch_load_dword a15, off, s32 ; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d +; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4407,10 +4467,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4428,72 +4488,355 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216) ret <16 x float> %result } -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 -; SDAG-NEXT: v_mov_b32_e32 v16, s1 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: 
scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, -2 +; SDAG-NEXT: v_mov_b32_e32 v32, 1.0 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, 
a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v32, -2 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v32, 0.15915494 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: 
v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0.15915494 +; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; 
GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: 
v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + ret <16 x float> %result +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 +; SDAG-NEXT: 
v_accvgpr_write_b32 a4, s40 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] @@ -4514,13 +4857,14 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 -; GISEL-NEXT: v_mov_b32_e32 v16, s1 +; GISEL-NEXT: v_mov_b32_e32 v16, s0 +; 
GISEL-NEXT: v_mov_b32_e32 v17, s1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 @@ -4536,7 +4880,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; SDAG-NEXT: s_movk_i32 s2, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v16, -2 +; SDAG-NEXT: v_mov_b32_e32 v17, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 @@ -4572,11 +4917,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -4588,6 +4933,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: 
s_load_dwordx16 s[36:51], s[4:5], 0x40 ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] @@ -4615,11 +4961,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -4670,9 +5016,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b32_e32 v16, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] ; SDAG-NEXT: v_mov_b32_e32 v0, s20 ; SDAG-NEXT: v_mov_b32_e32 v1, s21 ; SDAG-NEXT: v_mov_b32_e32 v2, s22 @@ -4744,10 +5091,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b32_e32 v16, s1 +; GISEL-NEXT: v_mov_b32_e32 v16, s0 +; 
GISEL-NEXT: v_mov_b32_e32 v17, s1 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] @@ -4765,7 +5113,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 @@ -4785,6 +5133,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: v_mov_b32_e32 v16, 42 +; SDAG-NEXT: v_mov_b32_e32 v17, 25 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 @@ -4821,7 +5171,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2 ; SDAG-NEXT: v_mov_b32_e32 v0, s20 ; SDAG-NEXT: v_mov_b32_e32 v1, s21 ; SDAG-NEXT: v_mov_b32_e32 v2, s22 @@ -4865,9 +5215,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 
s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 +; GISEL-NEXT: v_mov_b32_e32 v16, 25 +; GISEL-NEXT: v_mov_b32_e32 v17, 42 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] @@ -4893,10 +5243,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] @@ -5059,7 +5410,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 @@ -5079,6 +5430,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: v_mov_b32_e32 v16, 42 +; SDAG-NEXT: v_mov_b32_e32 v17, 25 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: 
v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 @@ -5115,7 +5468,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2 ; SDAG-NEXT: v_mov_b32_e32 v0, s20 ; SDAG-NEXT: v_mov_b32_e32 v1, s21 ; SDAG-NEXT: v_mov_b32_e32 v2, s22 @@ -5159,9 +5512,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 +; GISEL-NEXT: v_mov_b32_e32 v16, 25 +; GISEL-NEXT: v_mov_b32_e32 v17, 42 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] @@ -5187,10 +5540,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] @@ -5247,7 +5601,7 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5294,7 +5648,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5317,95 +5671,189 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b( } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 1 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 1 +; SDAG-NEXT: v_mov_b32_e32 v32, 0 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; 
SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0 +; GISEL-NEXT: v_mov_b32_e32 v32, 1 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: 
v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <16 x float> %result } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 1, 0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; 
GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 0 +; SDAG-NEXT: v_mov_b32_e32 v32, 1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, 
a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 1 +; GISEL-NEXT: v_mov_b32_e32 v32, 0 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; 
GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) ret <16 x float> %result } @@ -5441,7 +5889,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5486,7 +5934,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5538,7 +5986,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5583,7 +6031,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: 
v_accvgpr_read_b32 v2, a2 @@ -5634,7 +6082,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5678,7 +6126,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5727,7 +6175,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5779,7 +6227,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5824,7 +6272,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: 
s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5876,7 +6324,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5921,7 +6369,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5972,7 +6420,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6023,7 +6471,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6074,7 +6522,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], 
a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6118,7 +6566,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6167,7 +6615,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6202,6 +6650,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { "amdgpu-flat-work-group-size"="128,128" } attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 5eb6d203098ee..076cf09678b57 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -840,6 +840,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -850,12 +851,14 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -1015,6 +1018,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1025,12 +1029,14 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: 
v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -1167,9 +1173,10 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -1316,9 +1323,10 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; 
GFX12-SDAG-NEXT: s_endpgm @@ -1468,9 +1476,10 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -1655,9 +1664,10 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -3395,6 +3405,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3405,12 +3416,14 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: 
v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -3497,6 +3510,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3507,12 +3521,14 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -3722,9 +3738,10 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_load_b128 
s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -3799,9 +3816,10 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -4023,9 +4041,10 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -4104,9 +4123,10 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -8550,6 +8570,7 @@ define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off @@ -8588,6 +8609,7 @@ define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off @@ -8626,6 +8648,7 @@ define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off @@ -8664,6 +8687,7 @@ define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; 
GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off @@ -8702,6 +8726,7 @@ define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off @@ -8740,6 +8765,7 @@ define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off @@ -8778,6 +8804,7 @@ define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %sr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off @@ -8816,6 +8843,7 @@ define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %s ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off @@ -8877,6 +8905,7 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 @@ -8892,6 +8921,7 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -8954,6 +8984,7 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 % ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 @@ -8969,6 +9000,7 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 % ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 @@ -9057,6 +9089,7 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 @@ -9079,6 +9112,7 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -9174,6 +9208,7 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 @@ -9196,6 +9231,7 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 @@ -9273,6 +9309,7 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 @@ -9290,6 +9327,7 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ 
-9362,6 +9400,7 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 @@ -9379,6 +9418,7 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll index bb42834221681..10c000095fe3d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll @@ -34,6 +34,7 @@ define void @v_permlane16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %s ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 @@ -75,6 +76,7 @@ define void @v_permlanex16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 % ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 @@ -127,6 +129,7 @@ define 
void @v_permlane16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 @@ -185,6 +188,7 @@ define void @v_permlanex16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 @@ -230,6 +234,7 @@ define void @v_permlane16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off @@ -268,6 +273,7 @@ define void @v_permlanex16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off @@ -310,6 +316,7 @@ define void @v_permlane16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %sr ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: 
v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -354,6 +361,7 @@ define void @v_permlanex16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %s ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 @@ -394,6 +402,7 @@ define void @v_permlane16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off @@ -432,6 +441,7 @@ define void @v_permlanex16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off @@ -474,6 +484,7 @@ define void @v_permlane16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %sr ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -518,6 +529,7 @@ define void @v_permlanex16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %s ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; 
GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 @@ -558,6 +570,7 @@ define void @v_permlane16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off @@ -596,6 +609,7 @@ define void @v_permlanex16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off @@ -638,6 +652,7 @@ define void @v_permlane16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %sr ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -682,6 +697,7 @@ define void @v_permlanex16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %s ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll index 22a473e44b273..08d2201036c77 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll @@ -78,15 +78,14 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__ ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 ; GFX12-NEXT: v_readfirstlane_b32 s3, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -95,13 +94,11 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__ ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr6 ; GFX12-NEXT: ; implicit-def: $vgpr5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 %ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 0605a158b974f..3d60dd5b7ac5f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -163,6 +163,31 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) { ret void } +; FIXME: Broken +; define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) { +; %x = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src) +; call void asm sideeffect "; use $0", "s"(<2 x i64> %x) +; ret void +; } + +; define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) { +; %x = call <3 x i64> @llvm.amdgcn.readfirstlane.v3i64(<3 x i64> %src) +; call void asm sideeffect "; use $0", "s"(<3 x i64> %x) +; ret void +; } + +; define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) { +; %x = call <4 x i64> @llvm.amdgcn.readfirstlane.v4i64(<4 x i64> %src) +; call void asm sideeffect "; use $0", "s"(<4 x i64> %x) +; ret void +; } + +; define void @test_readfirstlane_v8i64(ptr addrspace(1) %out, <8 x i64> %src) { +; %x = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> %src) +; call void asm sideeffect "; use $0", "s"(<8 x i64> %x) +; ret void +; } + define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_f64: ; CHECK-SDAG: ; %bb.0: @@ -637,6 +662,472 @@ define void @test_readfirstlane_v2f32(ptr addrspace(1) %out, <2 x float> %src) { ret void } +define void @test_readfirstlane_v3f32(ptr addrspace(1) %out, <3 x float> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v3f32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:6] +; 
CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v3f32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:6] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x float> @llvm.amdgcn.readfirstlane.v3f32(<3 x float> %src) + call void asm sideeffect "; use $0", "s"(<3 x float> %x) + ret void +} + +define void @test_readfirstlane_v4f32(ptr addrspace(1) %out, <4 x float> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v4f32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v4f32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <4 x float> @llvm.amdgcn.readfirstlane.v4f32(<4 x float> %src) + call void asm sideeffect "; use $0", "s"(<4 x float> %x) + ret void +} + +define void @test_readfirstlane_v8f32(ptr addrspace(1) %out, <8 x float> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v8f32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:11] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v8f32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:11] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <8 x float> @llvm.amdgcn.readfirstlane.v8f32(<8 x float> %src) + call void asm sideeffect "; use $0", "s"(<8 x float> %x) + ret void +} + +define void @test_readfirstlane_v16f32(ptr addrspace(1) %out, <16 x float> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v16f32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 +; 
CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v16f32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <16 x float> @llvm.amdgcn.readfirstlane.v16f32(<16 x float> %src) + call void asm sideeffect "; use $0", "s"(<16 x float> %x) + ret void +} + +define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v32f32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-SDAG-NEXT: buffer_store_dword v31, off, 
s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s39, 3 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s48, 4 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s49, 5 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s50, 6 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s51, 7 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s52, 8 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s53, 9 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s54, 10 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s55, 11 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s64, 12 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 13 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 14 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 15 +; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27 +; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s64, v30 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s55, v21 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s54, v20 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s53, v19 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s52, v18 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s51, v17 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s50, v16 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s49, v15 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s48, v14 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s39, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24 +; 
CHECK-SDAG-NEXT: v_readfirstlane_b32 s57, v23 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s56, v22 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s47, v13 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s46, v12 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s45, v11 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s44, v10 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s43, v9 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s42, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v0 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v1 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v27 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[36:67] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_readlane_b32 s67, v31, 15 +; CHECK-SDAG-NEXT: v_readlane_b32 s66, v31, 14 +; CHECK-SDAG-NEXT: v_readlane_b32 s65, v31, 13 +; CHECK-SDAG-NEXT: v_readlane_b32 s64, v31, 12 +; CHECK-SDAG-NEXT: v_readlane_b32 s55, v31, 11 +; CHECK-SDAG-NEXT: v_readlane_b32 s54, v31, 10 +; CHECK-SDAG-NEXT: v_readlane_b32 s53, v31, 9 +; CHECK-SDAG-NEXT: v_readlane_b32 s52, v31, 8 +; CHECK-SDAG-NEXT: v_readlane_b32 s51, v31, 7 +; CHECK-SDAG-NEXT: v_readlane_b32 s50, v31, 6 +; CHECK-SDAG-NEXT: v_readlane_b32 s49, v31, 5 +; CHECK-SDAG-NEXT: v_readlane_b32 s48, v31, 4 +; CHECK-SDAG-NEXT: v_readlane_b32 s39, v31, 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s38, v31, 2 +; CHECK-SDAG-NEXT: v_readlane_b32 s37, v31, 1 +; CHECK-SDAG-NEXT: v_readlane_b32 s36, v31, 0 +; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v32f32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s36, 0 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s39, 3 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s48, 4 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s49, 5 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s50, 6 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s51, 7 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s52, 8 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s53, 9 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s54, 10 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s55, 11 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s64, 12 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 13 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 14 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 15 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2 +; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s37, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s38, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s39, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s48, v14 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s49, v15 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s50, v16 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s51, v17 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s52, v18 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s53, v19 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s54, v20 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s55, v21 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s64, v30 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s40, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s41, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s42, v8 +; 
CHECK-GISEL-NEXT: v_readfirstlane_b32 s43, v9 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s44, v10 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s45, v11 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s46, v12 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s47, v13 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s56, v22 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s57, v23 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s58, v24 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s59, v25 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s60, v26 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s61, v27 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s62, v28 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s63, v29 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(2) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s65, v0 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(1) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s66, v1 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s67, v2 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[36:67] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_readlane_b32 s67, v31, 15 +; CHECK-GISEL-NEXT: v_readlane_b32 s66, v31, 14 +; CHECK-GISEL-NEXT: v_readlane_b32 s65, v31, 13 +; CHECK-GISEL-NEXT: v_readlane_b32 s64, v31, 12 +; CHECK-GISEL-NEXT: v_readlane_b32 s55, v31, 11 +; CHECK-GISEL-NEXT: v_readlane_b32 s54, v31, 10 +; CHECK-GISEL-NEXT: v_readlane_b32 s53, v31, 9 +; CHECK-GISEL-NEXT: v_readlane_b32 s52, v31, 8 +; CHECK-GISEL-NEXT: v_readlane_b32 s51, v31, 7 +; CHECK-GISEL-NEXT: v_readlane_b32 s50, v31, 6 +; CHECK-GISEL-NEXT: v_readlane_b32 s49, v31, 5 +; CHECK-GISEL-NEXT: v_readlane_b32 s48, v31, 4 +; CHECK-GISEL-NEXT: v_readlane_b32 s39, v31, 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s38, v31, 2 +; CHECK-GISEL-NEXT: v_readlane_b32 s37, v31, 1 +; CHECK-GISEL-NEXT: v_readlane_b32 s36, v31, 0 +; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-GISEL-NEXT: 
s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <32 x float> @llvm.amdgcn.readfirstlane.v32f32(<32 x float> %src) + call void asm sideeffect "; use $0", "s"(<32 x float> %x) + ret void +} + +define void @test_readfirstlane_v2i32(ptr addrspace(1) %out, <2 x i32> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v2i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:5] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v2i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:5] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> %src) + call void asm sideeffect "; use $0", "s"(<2 x i32> %x) + ret void +} + +define void @test_readfirstlane_v3i32(ptr addrspace(1) %out, <3 x i32> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v3i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v3i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; 
use s[4:6] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> %src) + call void asm sideeffect "; use $0", "s"(<3 x i32> %x) + ret void +} + +define void @test_readfirstlane_v4i32(ptr addrspace(1) %out, <4 x i32> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v4i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v4i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> %src) + call void asm sideeffect "; use $0", "s"(<4 x i32> %x) + ret void +} + +define void @test_readfirstlane_v5i32(ptr addrspace(1) %out, <5 x i32> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v5i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:8] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: 
test_readfirstlane_v5i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:8] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <5 x i32> @llvm.amdgcn.readfirstlane.v5i32(<5 x i32> %src) + call void asm sideeffect "; use $0", "s"(<5 x i32> %x) + ret void +} + +define void @test_readfirstlane_v6i32(ptr addrspace(1) %out, <6 x i32> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v6i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v6i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:9] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <6 x i32> @llvm.amdgcn.readfirstlane.v6i32(<6 x i32> %src) + call void asm sideeffect "; use $0", "s"(<6 x i32> %x) + ret void +} + define void 
@test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v7i32: ; CHECK-SDAG: ; %bb.0: @@ -672,6 +1163,271 @@ define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) { ret void } +define void @test_readfirstlane_v8i32(ptr addrspace(1) %out, <8 x i32> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v8i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:11] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v8i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:11] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32(<8 x i32> %src) + call void asm sideeffect "; use $0", "s"(<8 x i32> %x) + ret void +} + +define void @test_readfirstlane_v16i32(ptr addrspace(1) %out, <16 x i32> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v16i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v16i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <16 x i32> 
@llvm.amdgcn.readfirstlane.v16i32(<16 x i32> %src) + call void asm sideeffect "; use $0", "s"(<16 x i32> %x) + ret void +} + +define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v32i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s39, 3 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s48, 4 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s49, 5 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s50, 6 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s51, 7 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s52, 8 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s53, 9 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s54, 10 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s55, 11 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s64, 12 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 13 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 14 +; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 15 +; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27 +; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s64, v30 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s55, v21 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s54, v20 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s53, v19 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s52, v18 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s51, v17 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s50, v16 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s49, v15 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s48, v14 +; 
CHECK-SDAG-NEXT: v_readfirstlane_b32 s39, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s57, v23 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s56, v22 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s47, v13 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s46, v12 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s45, v11 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s44, v10 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s43, v9 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s42, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v0 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v1 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v27 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[36:67] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_readlane_b32 s67, v31, 15 +; CHECK-SDAG-NEXT: v_readlane_b32 s66, v31, 14 +; CHECK-SDAG-NEXT: v_readlane_b32 s65, v31, 13 +; CHECK-SDAG-NEXT: v_readlane_b32 s64, v31, 12 +; CHECK-SDAG-NEXT: v_readlane_b32 s55, v31, 11 +; CHECK-SDAG-NEXT: v_readlane_b32 s54, v31, 10 +; CHECK-SDAG-NEXT: v_readlane_b32 s53, v31, 9 +; CHECK-SDAG-NEXT: v_readlane_b32 s52, v31, 8 +; CHECK-SDAG-NEXT: v_readlane_b32 s51, v31, 7 +; CHECK-SDAG-NEXT: v_readlane_b32 s50, v31, 6 +; CHECK-SDAG-NEXT: v_readlane_b32 s49, v31, 5 +; CHECK-SDAG-NEXT: v_readlane_b32 s48, v31, 4 +; CHECK-SDAG-NEXT: v_readlane_b32 s39, v31, 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s38, v31, 2 +; CHECK-SDAG-NEXT: v_readlane_b32 s37, v31, 1 +; 
CHECK-SDAG-NEXT: v_readlane_b32 s36, v31, 0 +; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v32i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s36, 0 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s39, 3 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s48, 4 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s49, 5 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s50, 6 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s51, 7 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s52, 8 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s53, 9 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s54, 10 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s55, 11 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s64, 12 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 13 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 14 +; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 15 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2 +; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s37, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s38, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s39, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s48, v14 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s49, v15 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s50, v16 +; CHECK-GISEL-NEXT: 
v_readfirstlane_b32 s51, v17 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s52, v18 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s53, v19 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s54, v20 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s55, v21 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s64, v30 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s40, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s41, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s42, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s43, v9 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s44, v10 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s45, v11 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s46, v12 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s47, v13 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s56, v22 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s57, v23 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s58, v24 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s59, v25 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s60, v26 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s61, v27 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s62, v28 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s63, v29 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(2) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s65, v0 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(1) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s66, v1 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s67, v2 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[36:67] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_readlane_b32 s67, v31, 15 +; CHECK-GISEL-NEXT: v_readlane_b32 s66, v31, 14 +; CHECK-GISEL-NEXT: v_readlane_b32 s65, v31, 13 +; CHECK-GISEL-NEXT: v_readlane_b32 s64, v31, 12 +; CHECK-GISEL-NEXT: v_readlane_b32 s55, v31, 11 +; CHECK-GISEL-NEXT: v_readlane_b32 s54, v31, 10 +; CHECK-GISEL-NEXT: v_readlane_b32 s53, v31, 9 +; CHECK-GISEL-NEXT: v_readlane_b32 s52, v31, 8 +; CHECK-GISEL-NEXT: v_readlane_b32 s51, v31, 7 +; CHECK-GISEL-NEXT: v_readlane_b32 s50, v31, 6 +; CHECK-GISEL-NEXT: v_readlane_b32 s49, v31, 5 +; CHECK-GISEL-NEXT: 
v_readlane_b32 s48, v31, 4 +; CHECK-GISEL-NEXT: v_readlane_b32 s39, v31, 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s38, v31, 2 +; CHECK-GISEL-NEXT: v_readlane_b32 s37, v31, 1 +; CHECK-GISEL-NEXT: v_readlane_b32 s36, v31, 0 +; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <32 x i32> @llvm.amdgcn.readfirstlane.v32i32(<32 x i32> %src) + call void asm sideeffect "; use $0", "s"(<32 x i32> %x) + ret void +} + define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v8i16: ; CHECK-SDAG: ; %bb.0: @@ -700,3 +1456,148 @@ define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) { call void asm sideeffect "; use $0", "s"(<8 x i16> %x) ret void } + +define void @test_readfirstlane_v16i16(ptr addrspace(1) %out, <16 x i16> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v16i16: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:11] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v16i16: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: 
v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:11] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <16 x i16> @llvm.amdgcn.readfirstlane.v16i16(<16 x i16> %src) + call void asm sideeffect "; use $0", "s"(<16 x i16> %x) + ret void +} + +define void @test_readfirstlane_v32i16(ptr addrspace(1) %out, <32 x i16> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v32i16: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v32i16: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: 
v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <32 x i16> @llvm.amdgcn.readfirstlane.v32i16(<32 x i16> %src) + call void asm sideeffect "; use $0", "s"(<32 x i16> %x) + ret void +} + + +define void @test_readfirstlane_v32f16(ptr addrspace(1) %out, <32 x half> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v32f16: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
CHECK-GISEL-LABEL: test_readfirstlane_v32f16: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <32 x half> @llvm.amdgcn.readfirstlane.v32f16(<32 x half> %src) + call void asm sideeffect "; use $0", "s"(<32 x half> %x) + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll new file mode 100644 index 0000000000000..0ffee36d520dc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 -enable-var-scope %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 -enable-var-scope 
%s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX11 %s +; Test codegen with readfirstlane used by M0. +; +; M0 can only be written to by SALU instructions so we can't emit +; a v_readfirstlane_b32 m0 + +define void @test_readfirstlane_m0(i32 %arg) { +; GFX10-LABEL: test_readfirstlane_m0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s4 +; GFX10-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_readfirstlane_m0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 m0, s0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %1 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %arg) + call void @llvm.amdgcn.s.sendmsg(i32 1, i32 %1) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll index 5dec1e15cb3d5..af792851e0ced 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll @@ -26,19 +26,13 @@ define amdgpu_cs void @ttracedata_s(i32 inreg %val) { } define amdgpu_cs void @ttracedata_v(i32 %val) { -; GFX11-SDAG-LABEL: ttracedata_v: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: s_mov_b32 m0, s0 -; GFX11-SDAG-NEXT: s_ttracedata -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: ttracedata_v: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GFX11-GISEL-NEXT: s_ttracedata -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: ttracedata_v: 
+; GFX11: ; %bb.0: +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 m0, s0 +; GFX11-NEXT: s_ttracedata +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.s.ttracedata(i32 %val) ret void } @@ -51,3 +45,6 @@ define amdgpu_cs void @ttracedata_imm() { call void @llvm.amdgcn.s.ttracedata.imm(i16 1000) ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-GISEL: {{.*}} +; GFX11-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll new file mode 100644 index 0000000000000..0764cd5d34d75 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll @@ -0,0 +1,933 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-minreg < %s | FileCheck -check-prefix=GCN-MINREG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-max-occupancy-experimental < %s | FileCheck -check-prefix=GCN-MAXOCC %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -misched=gcn-iterative-ilp < %s | FileCheck -check-prefix=GCN-ILP %s + +define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: +; GCN-MINREG: ; %bb.0: ; %entry +; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_add_u32_e32 v4, s0, v0 +; 
GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:112 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:96 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:80 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:64 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:16 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:32 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:48 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: v_add_u32_e32 v5, s1, v0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v0, s1 +; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[24:27] offset:96 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[20:23] offset:80 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[16:19] offset:64 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[12:15] offset:48 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[8:11] offset:32 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[4:7] offset:16 +; GCN-MINREG-NEXT: ds_write_b128 v5, a[0:3] +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:8304 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:8288 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:8272 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:8256 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:8240 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:8224 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:8208 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:8192 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; 
sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:24688 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:24672 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:24656 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:24640 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:24624 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:24608 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:24576 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; 
GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:49264 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:49248 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:49232 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:49216 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:49200 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:49184 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:49168 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:49152 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:24688 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:24656 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:24608 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:24624 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:24576 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:57456 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:57440 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:57424 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:57408 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:57344 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:57360 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:57376 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:57392 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; 
GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: s_endpgm +; +; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: +; GCN-MAXOCC: ; %bb.0: ; %entry +; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v1 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s1, v1 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; 
GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:112 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:96 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:80 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:64 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:48 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:8272 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, s1 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:8288 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:8304 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:8256 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:8272 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:8224 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:8240 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:8192 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:8208 +; 
GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:16480 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:16496 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:16448 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:16464 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:16416 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:16432 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:16384 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16400 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0 +; 
GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:24672 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:24688 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:24640 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:24656 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:24608 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:24624 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:24576 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:32864 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:32880 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:32832 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:32848 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32800 +; GCN-MAXOCC-NEXT: 
ds_write_b128 v1, a[12:15] offset:32816 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:32768 +; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:32784 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: s_endpgm +; +; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: +; GCN-ILP: ; %bb.0: ; %entry +; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-ILP-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-ILP-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:112 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-ILP-NEXT: ds_read_b128 
a[8:11], v3 offset:8224 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: v_mov_b32_e32 v0, s1 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:8304 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:8256 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:24592 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:24656 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 2 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:16384 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:16432 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:16416 +; GCN-ILP-NEXT: ds_read_b128 
a[8:11], v3 offset:49184 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:16464 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:16448 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:16496 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:16480 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:24592 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:24624 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:24608 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:24656 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:24688 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; 
GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 2 +; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64 + %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr + %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192 + %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256 + %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr + %mai.0 = 
tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0) + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0) + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0) + %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0) + %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx + store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64 + store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr + %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128 + store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192 + store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256 + store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call 
void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ret void +} + + +define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_split_region(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: +; GCN-MINREG: ; %bb.0: ; %entry +; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MINREG-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-MINREG-NEXT: v_mov_b32_e32 v0, 2.0 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_add_u32_e32 v3, s0, v2 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:48 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: v_add_u32_e32 v2, s1, v2 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 1 +; 
GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:112 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:96 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:80 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:64 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:48 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:32 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: v_mov_b32_e32 v2, s1 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 1 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:8304 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:8256 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:8272 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:8224 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:8240 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:8192 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:8208 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_barrier mask(0x00000000) +; GCN-MINREG-NEXT: ds_read_b128 
a[28:31], v3 offset:24688 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:24672 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:24656 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:24640 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:24576 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:24592 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:24608 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:24624 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 1 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:16496 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:16480 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:16464 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:16448 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:16432 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:16416 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16400 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:16384 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:49264 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:49248 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:49232 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:49216 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:49200 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:49184 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:49168 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:49152 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier 
mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:24688 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:24672 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:24656 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:24640 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:24624 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:24608 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:24592 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:24576 +; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:57456 +; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:57440 +; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:57424 +; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:57408 +; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:57344 +; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:57360 +; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:57376 +; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:57392 +; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 2 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:32880 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:32864 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:32848 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:32832 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:32816 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:32800 +; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:32784 +; GCN-MINREG-NEXT: 
ds_write_b128 v2, a[0:3] offset:32768 +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: s_endpgm +; +; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: +; GCN-MAXOCC: ; %bb.0: ; %entry +; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v3, 0x1ff80, v0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v3 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s1, v3 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:112 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:96 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:80 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:64 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:48 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288 +; GCN-MAXOCC-NEXT: 
ds_read_b128 a[20:23], v0 offset:8272 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, s1 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:8288 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:8304 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:8256 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:8272 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:8224 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:8240 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:8192 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:8208 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_barrier mask(0x00000000) +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: ; 
sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:16496 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:16480 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:16464 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:16448 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:16432 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:16416 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16400 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:16384 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 1 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:24688 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:24672 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:24656 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:24640 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:24624 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:24608 +; 
GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:24592 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:24576 +; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456 +; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440 +; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424 +; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408 +; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344 +; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360 +; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376 +; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392 +; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) +; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 2 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:32880 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:32864 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:32848 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:32832 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:32816 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32800 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:32784 +; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:32768 +; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MAXOCC-NEXT: s_endpgm +; +; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: +; GCN-ILP: ; %bb.0: ; %entry +; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-ILP-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 +; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, 
v2 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v2, s1, v2 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:48 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:64 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:80 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:96 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:112 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_mov_b32_e32 v2, s1 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; 
GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:8288 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:8304 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:8256 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:8272 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:8224 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:8240 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:8192 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:8208 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_barrier mask(0x00000000) +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:24592 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:24656 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 2 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:16496 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:16480 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:16464 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:16448 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:16432 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:16416 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16400 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:16384 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168 +; GCN-ILP-NEXT: ds_read_b128 
a[8:11], v3 offset:49184 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 1 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:24576 +; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:24592 +; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:24608 +; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:24624 +; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:24640 +; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:24656 +; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:24672 +; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:24688 +; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456 +; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 2 +; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:32880 +; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] 
offset:32864 +; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:32848 +; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:32832 +; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:32816 +; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32800 +; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:32784 +; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:32768 +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-ILP-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64 + %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr + %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0) + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0) + %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx + store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64 + store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; parition the regions + call void 
@llvm.amdgcn.sched.barrier(i32 0) + %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192 + %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256 + %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0) + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0) + %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0) + %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128 + store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192 + store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256 + store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 0) + ; 8 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0) + ; 1 MFMA + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 8 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 8, i32 
0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 66c02a9bd0c6a..6b922fcd9b550 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -33,7 +33,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -59,7 +59,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] ; GISEL-NEXT: s_endpgm bb: @@ -81,7 +81,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> % ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -92,7 +92,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> % ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -112,7 +112,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: 
v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -123,7 +123,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -143,7 +143,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -154,7 +154,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -187,7 +187,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, < ; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -246,7 +246,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], 
v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -279,7 +279,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -317,7 +317,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -389,7 +389,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -461,7 +461,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -561,7 +561,7 @@ define <16 x float> 
@test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -650,7 +650,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; GCN-NEXT: s_endpgm bb: @@ -672,7 +672,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -692,7 +692,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -712,7 +712,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -745,7 +745,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0 ; GCN-NEXT: v_mov_b32_e32 
v12, s28 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -788,7 +788,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -826,7 +826,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -871,7 +871,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, < ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -916,7 +916,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, < ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -989,7 +989,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg ; GCN-NEXT: s_nop 1 ; GCN-NEXT: 
v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: @@ -1094,7 +1094,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1105,7 +1105,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1125,7 +1125,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 
cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1136,7 +1136,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1156,7 +1156,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1167,7 +1167,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1200,7 +1200,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1265,7 +1265,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; 
SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] @@ -1298,7 +1298,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -1336,7 +1336,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1408,7 +1408,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1480,7 +1480,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1580,7 +1580,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> 
inreg %arg0, <8 x ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1675,7 +1675,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -1701,7 +1701,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: @@ -1723,7 +1723,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1734,7 +1734,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1754,7 +1754,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: 
s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1765,7 +1765,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1785,7 +1785,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1796,7 +1796,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1829,7 +1829,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1890,7 +1890,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr 
addrspace ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -1916,7 +1916,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: @@ -1938,7 +1938,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1949,7 +1949,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1969,7 +1969,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1980,7 +1980,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; 
GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2000,7 +2000,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2011,7 +2011,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2044,7 +2044,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2105,7 +2105,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -2131,7 +2131,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; 
GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: @@ -2153,7 +2153,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2164,7 +2164,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2184,7 +2184,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2195,7 +2195,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2215,7 +2215,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x 
i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2226,7 +2226,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2259,7 +2259,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2320,7 +2320,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -2346,7 +2346,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: @@ -2368,7 +2368,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: 
v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2379,7 +2379,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2399,7 +2399,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2410,7 +2410,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2430,7 +2430,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2441,7 +2441,7 @@ define <4 x float> 
@test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2474,7 +2474,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2539,7 +2539,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] @@ -2572,7 +2572,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -2610,7 +2610,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; 
SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2682,7 +2682,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2754,7 +2754,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2854,7 +2854,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2953,7 +2953,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] @@ -2986,7 +2986,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; 
GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -3024,7 +3024,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3096,7 +3096,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3168,7 +3168,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3268,7 +3268,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3367,7 +3367,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; 
SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] @@ -3400,7 +3400,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -3438,7 +3438,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3510,7 +3510,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3582,7 +3582,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3682,7 +3682,7 @@ define <16 x float> 
@test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3781,7 +3781,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] @@ -3814,7 +3814,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -3852,7 +3852,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3924,7 +3924,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; 
SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3996,7 +3996,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4096,7 +4096,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4156,4 +4156,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ret <16 x float> %result } -attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll index 0522d5258b9b5..fed7a8ec105fd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -122,6 +122,7 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll index 21dc07cf28fd6..5b752949859f2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll @@ -1,36 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) { -; SDAG-LABEL: buffer_load_lds_dword: -; SDAG: ; %bb.0: ; %main_body -; SDAG-NEXT: v_mov_b32_e32 v0, 8 -; SDAG-NEXT: s_mov_b32 m0, s4 -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds -; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds -; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds -; SDAG-NEXT: v_mov_b32_e32 v0, s4 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: ds_read_b32 v0, v0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: buffer_load_lds_dword: -; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: s_mov_b32 m0, s4 -; GISEL-NEXT: v_mov_b32_e32 v0, 8 -; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds -; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds -; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds -; GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-NEXT: s_waitcnt 
vmcnt(0) -; GISEL-NEXT: ds_read_b32 v0, v0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: ; return to shader part epilog +; GCN-LABEL: buffer_load_lds_dword: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, 8 +; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds +; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds +; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog main_body: call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll index 2698ce1dc3fe3..a2b9c869c9c9a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll @@ -47,15 +47,14 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr ; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_wait_alu 0xfffe +; 
GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 @@ -64,13 +63,11 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB2_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <2 x bfloat> %ret @@ -91,15 +88,14 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp ; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: 
s_and_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 @@ -108,12 +104,10 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB3_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index 6e94d4fe9fa27..91217c219c451 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -271,15 +271,14 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: 
s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 @@ -288,12 +287,10 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB4_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -413,15 +410,14 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 @@ -430,12 +426,10 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX1200-NEXT: ; 
implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr0 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB5_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index 9d8572493b456..80fd1e05477f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -216,15 +216,14 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 @@ -233,13 +232,11 @@ define float 
@struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB4_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %ret @@ -329,15 +326,14 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 @@ -346,13 +342,11 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1200-NEXT: ; implicit-def: $vgpr7 ; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX1200-NEXT: s_wait_alu 0xfffe ; 
GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: s_cbranch_execnz .LBB5_1 ; GFX1200-NEXT: ; %bb.2: ; GFX1200-NEXT: s_mov_b32 exec_lo, s2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret <2 x half> %ret diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll index 6da16f0a3b053..13bb72a96142f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll @@ -455,10 +455,10 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX12-NEXT: v_readfirstlane_b32 s5, v2 ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -467,13 +467,11 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = 
add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -613,15 +611,14 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 ; GFX12-NEXT: v_readfirstlane_b32 s3, v7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -630,13 +627,11 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll index aa41ef024d6e0..e75dd7409d51b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll @@ -455,10 +455,10 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX12-NEXT: v_readfirstlane_b32 s5, v2 ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -467,13 +467,11 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -613,15 +611,14 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 ; GFX12-NEXT: v_readfirstlane_b32 s3, v7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xf1ff ; 
GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, s0, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -630,13 +627,11 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll index cfe9545b074e3..2ece76da1388d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s -; RUN: llc -global-isel=1 
-mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950 %s ; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s @@ -14,32 +14,18 @@ declare void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr ;--------------------------------------------------------------------- define amdgpu_ps float @buffer_load_lds_dwordx3(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { -; GFX950-SDAG-LABEL: buffer_load_lds_dwordx3: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 8 -; GFX950-SDAG-NEXT: s_mov_b32 m0, s4 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen lds -; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds -; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX950-SDAG-NEXT: ds_read_b32 v0, v0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: ; return to shader part epilog -; -; GFX950-GISEL-LABEL: buffer_load_lds_dwordx3: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_mov_b32 m0, s4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 8 -; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen lds -; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds -; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX950-GISEL-NEXT: ds_read_b32 v0, v0 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX950-GISEL-NEXT: ; return to shader part epilog +; GFX950-LABEL: buffer_load_lds_dwordx3: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: v_mov_b32_e32 v0, 8 +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen lds +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds +; GFX950-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ds_read_b32 v0, v0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; return to shader part epilog call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 4, i32 1) call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 8, i32 2) @@ -107,32 +93,18 @@ define amdgpu_ps void @buffer_load_lds_dwordx3_vs_imm_offset(ptr addrspace(8) in ;--------------------------------------------------------------------- define amdgpu_ps float @buffer_load_lds_dwordx4(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { -; GFX950-SDAG-LABEL: buffer_load_lds_dwordx4: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 8 -; GFX950-SDAG-NEXT: s_mov_b32 m0, s4 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen lds -; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds -; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX950-SDAG-NEXT: ds_read_b32 v0, v0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: ; return to shader part epilog -; -; GFX950-GISEL-LABEL: buffer_load_lds_dwordx4: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: 
s_mov_b32 m0, s4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 8 -; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen lds -; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds -; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX950-GISEL-NEXT: ds_read_b32 v0, v0 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: ; return to shader part epilog +; GFX950-LABEL: buffer_load_lds_dwordx4: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: v_mov_b32_e32 v0, 8 +; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen lds +; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds +; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds +; GFX950-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ds_read_b32 v0, v0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; return to shader part epilog call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 4, i32 1) call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 8, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll index 04a9f926acd5b..35c959f2e805c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll @@ -1,36 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG -; RUN: 
llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN declare void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) define amdgpu_ps float @buffer_load_lds_dword(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { -; SDAG-LABEL: buffer_load_lds_dword: -; SDAG: ; %bb.0: ; %main_body -; SDAG-NEXT: v_mov_b32_e32 v0, 8 -; SDAG-NEXT: s_mov_b32 m0, s4 -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds -; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds -; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds -; SDAG-NEXT: v_mov_b32_e32 v0, s4 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: ds_read_b32 v0, v0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: buffer_load_lds_dword: -; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: s_mov_b32 m0, s4 -; GISEL-NEXT: v_mov_b32_e32 v0, 8 -; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds -; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds -; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds -; GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: ds_read_b32 v0, v0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: ; return to shader part epilog +; GCN-LABEL: buffer_load_lds_dword: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, 8 +; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds +; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds +; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds 
+; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog main_body: call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir index 179c9f4f8dc4d..0733d34f5e366 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir @@ -58,7 +58,7 @@ body: | ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] - ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] ; GCN-NEXT: [[S_MAX_U32_:%[0-9]+]]:sgpr_32 = S_MAX_U32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir index 88c35a6417d23..486c08335b170 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir @@ -58,7 +58,7 @@ body: | ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2 ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2 ; GCN-NEXT: 
[[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]] - ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]] ; GCN-NEXT: [[S_MIN_U32_:%[0-9]+]]:sgpr_32 = S_MIN_U32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]] ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll new file mode 100644 index 0000000000000..684b59c66ee8e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -O0 -stop-after=amdgpu-isel -o - %s | FileCheck --check-prefix=SelDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=legalizer -o - %s | FileCheck --check-prefix=GlobalISel %s + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.workitem.id.y() +declare i32 @llvm.amdgcn.workitem.id.z() + +define amdgpu_ps void @undefined_workitems(ptr %p, ptr %q, ptr %r) { + ; SelDAG-LABEL: name: undefined_workitems + ; SelDAG: bb.0 (%ir-block.0): + ; SelDAG-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; SelDAG-NEXT: {{ $}} + ; SelDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; SelDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; SelDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; SelDAG-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SelDAG-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SelDAG-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SelDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = 
REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; SelDAG-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; SelDAG-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; SelDAG-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; SelDAG-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; SelDAG-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]] + ; SelDAG-NEXT: S_ENDPGM 0 + ; + ; GlobalISel-LABEL: name: undefined_workitems + ; GlobalISel: bb.1 (%ir-block.0): + ; GlobalISel-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GlobalISel-NEXT: {{ $}} + ; GlobalISel-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GlobalISel-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GlobalISel-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GlobalISel-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GlobalISel-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GlobalISel-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GlobalISel-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GlobalISel-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GlobalISel-NEXT: [[MV2:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GlobalISel-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GlobalISel-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GlobalISel-NEXT: G_STORE [[COPY6]](s32), [[MV]](p0) :: (store (s32) into %ir.p) + ; GlobalISel-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GlobalISel-NEXT: G_STORE [[COPY7]](s32), [[MV1]](p0) :: (store (s32) into %ir.q) + ; GlobalISel-NEXT: G_STORE [[DEF]](s32), [[MV2]](p0) :: (store (s32) into %ir.r) + ; GlobalISel-NEXT: S_ENDPGM 0 + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + store i32 %id.x, ptr %p + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + store i32 %id.y, ptr %q + %id.z 
= call i32 @llvm.amdgcn.workitem.id.z() + store i32 %id.z, ptr %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 7342c366799e9..efbf830c95d10 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -887,16 +887,16 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1] ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v2 ; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_nop 1 +; GFX802-SDAG-NEXT: v_writelane_b32 v0, 0, s2 ; GFX802-SDAG-NEXT: v_writelane_b32 v1, s4, m0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX802-SDAG-NEXT: v_writelane_b32 v0, 0, s2 ; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX802-SDAG-NEXT: s_endpgm ; @@ -1407,7 +1407,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ret void } -define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { +define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0 @@ -1486,7 +1486,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ret void } -define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { +define amdgpu_kernel void 
@test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ret void } -define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 { +define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 @@ -1936,11 +1936,11 @@ define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) { ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_ushort v4, v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: s_nop 1 -; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 ; GFX802-SDAG-NEXT: flat_store_short v[0:1], v4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2014,11 +2014,11 @@ define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1) ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; 
GFX802-SDAG-NEXT: s_nop 1 -; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2092,11 +2092,11 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_ushort v4, v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: s_nop 1 -; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 ; GFX802-SDAG-NEXT: flat_store_short v[0:1], v4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2128,11 +2128,11 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-GISEL-NEXT: flat_load_ushort v4, v[0:1] -; GFX802-GISEL-NEXT: v_readfirstlane_b32 m0, v3 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX802-GISEL-NEXT: s_nop 1 -; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v4, s5, m0 ; GFX802-GISEL-NEXT: flat_store_short v[0:1], v4 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2170,11 +2170,11 @@ define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) { ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: 
flat_load_ushort v4, v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: s_nop 1 -; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 ; GFX802-SDAG-NEXT: flat_store_short v[0:1], v4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2248,11 +2248,11 @@ define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %s ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: s_nop 1 -; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2326,13 +2326,13 @@ define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %s ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_dwordx2 v[5:6], v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v4 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: s_nop 0 -; GFX802-SDAG-NEXT: v_writelane_b32 v6, s4, m0 -; 
GFX802-SDAG-NEXT: v_writelane_b32 v5, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v5, s6, m0 ; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[5:6] ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2419,23 +2419,24 @@ define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %sr ; GFX802-SDAG-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; GFX802-SDAG-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc ; GFX802-SDAG-NEXT: flat_load_dwordx3 v[14:16], v[17:18] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v9 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v4 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v2 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v8 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v7 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v6 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX802-SDAG-NEXT: v_writelane_b32 v13, s7, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v12, s8, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v11, s9, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v10, s10, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v13, s8, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v12, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v11, s10, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v10, s11, m0 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: v_writelane_b32 v16, s4, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v15, s5, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v14, s6, m0 +; GFX802-SDAG-NEXT: 
v_writelane_b32 v16, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v15, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v14, s7, m0 ; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[10:13] ; GFX802-SDAG-NEXT: flat_store_dwordx3 v[17:18], v[14:16] ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -2592,16 +2593,17 @@ define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %sr ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_dwordx4 v[7:10], v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v6 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v5 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v4 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: v_writelane_b32 v10, s4, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v9, s5, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v8, s6, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v7, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v10, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v9, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s8, m0 ; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll index edc1afe410a63..40e124382df95 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll @@ -8,13 +8,13 @@ define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) { ; GFX802-SDAG: ; 
%bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_dwordx2 v[5:6], v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v4 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: s_nop 0 -; GFX802-SDAG-NEXT: v_writelane_b32 v6, s4, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v5, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v5, s6, m0 ; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[5:6] ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -59,21 +59,22 @@ define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src ; GFX802-SDAG-NEXT: flat_load_dwordx4 v[9:12], v[0:1] ; GFX802-SDAG-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc ; GFX802-SDAG-NEXT: flat_load_dwordx2 v[15:16], v[13:14] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v8 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v5 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v4 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v2 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v7 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v6 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v7 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v6 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX802-SDAG-NEXT: v_writelane_b32 v12, s6, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v11, s7, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v10, 
s8, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v9, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v12, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v11, s8, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v10, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v9, s10, m0 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: v_writelane_b32 v16, s4, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v15, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v16, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v15, s6, m0 ; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[9:12] ; GFX802-SDAG-NEXT: flat_store_dwordx2 v[13:14], v[15:16] ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -141,11 +142,11 @@ define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: s_nop 1 -; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -183,14 +184,15 @@ define void @test_writelane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> % ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_dwordx3 v[6:8], v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v5 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-SDAG-NEXT: 
v_readfirstlane_b32 s7, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: v_writelane_b32 v8, s4, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v7, s5, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v6, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s7, m0 ; GFX802-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[6:8] ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -237,11 +239,11 @@ define void @test_writelane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: s_nop 1 -; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -279,14 +281,15 @@ define void @test_writelane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> % ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_dwordx3 v[6:8], v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v5 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; 
GFX802-SDAG-NEXT: v_writelane_b32 v8, s4, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v7, s5, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v6, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s7, m0 ; GFX802-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[6:8] ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -333,11 +336,11 @@ define void @test_writelane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: s_nop 1 -; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -375,14 +378,15 @@ define void @test_writelane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> % ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-SDAG-NEXT: flat_load_dwordx3 v[6:8], v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v5 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: v_writelane_b32 v8, s4, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v7, s5, m0 -; GFX802-SDAG-NEXT: 
v_writelane_b32 v6, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s7, m0 ; GFX802-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[6:8] ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index b850428a03c05..87a659de7c95f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -14,30 +14,31 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s0, s[4:5], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s1, v2 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 -; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, 
exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3377d1cf +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, v3 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log_f32: @@ -70,32 +71,34 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; VI-SDAG-NEXT: v_ldexp_f32 v0, s0, v0 -; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v4, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 ; VI-SDAG-NEXT: s_endpgm ; @@ -132,29 +135,30 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s6, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 -; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s0, -v2 -; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s1, v3 -; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX900-SDAG-NEXT: global_store_dword v0, v1, s[2:3] +; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217 +; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3377d1cf +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4 +; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_f32: @@ -188,26 +192,25 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: 
v_cmp_gt_f32_e64 s2, 0x800000, s0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s1 +; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s0, s1 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| +; GFX1100-SDAG-NEXT: v_fma_f32 v3, 0x3f317217, v1, -v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_fmamk_f32 v3, v1, 0x3377d1cf, v3 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; 
GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm @@ -316,44 +319,46 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf ; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s3, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s3, 0x3f317217 -; SI-SDAG-NEXT: s_mov_b32 s4, s0 -; SI-SDAG-NEXT: s_mov_b32 s5, s1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 -; SI-SDAG-NEXT: v_fma_f32 v3, v1, s3, -v2 -; SI-SDAG-NEXT: v_fma_f32 v3, v1, s8, v3 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: s_mov_b32 s0, s4 +; SI-SDAG-NEXT: s_mov_b32 s1, s5 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: s_mov_b32 s7, 0x3f317217 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3 +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; SI-SDAG-NEXT: 
v_fma_f32 v5, v3, s7, -v4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-SDAG-NEXT: v_log_f32_e32 v5, v1 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s9 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s2, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0 -; SI-SDAG-NEXT: v_fma_f32 v4, v0, s3, -v3 -; SI-SDAG-NEXT: v_fma_f32 v4, v0, s8, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s9 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; SI-SDAG-NEXT: s_mov_b32 s6, -1 -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log_v2f32: @@ -398,49 +403,51 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-SDAG-LABEL: 
s_log_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s7, v1 -; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5 +; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-SDAG-NEXT: v_log_f32_e32 v5, v1 +; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, 
v5, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s2 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; VI-SDAG-NEXT: v_ldexp_f32 v0, s6, v0 -; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 -; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v4, v0, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; @@ -494,41 +501,43 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v2f32: ; GFX900-SDAG: ; %bb.0: -; 
GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217 -; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3377d1cf -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x3377d1cf +; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x7f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s11, v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1 -; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, s4 +; GFX900-SDAG-NEXT: v_ldexp_f32 v4, s3, v4 +; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3f317217 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v4 +; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s3, -v5 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s6, v6 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v1 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s7 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 
0x3f317217, v6 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v6, s3, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v6, s6, v4 ; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s10, v0 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s2, -v4 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s3, v5 -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] -; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 -; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s7 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 +; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_v2f32: @@ -574,39 +583,37 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; 
GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s4 +; GFX1100-SDAG-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s5 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s3, s4 +; GFX1100-SDAG-NEXT: s_and_b32 s5, s5, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s5, 32, 0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, s5 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v3, 0x3f317217, v1 :: v_dual_lshlrev_b32 v0, 5, v0 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_fma_f32 v5, 0x3f317217, v1, -v3 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v5, 0x3377d1cf, v1 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3f317217, v0 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s5 -; GFX1100-SDAG-NEXT: v_fma_f32 v4, 0x3f317217, v0, -v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v4, 0x3377d1cf, v0 -; 
GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0 -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5 -; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v3 +; GFX1100-SDAG-NEXT: v_fma_f32 v6, 0x3f317217, v1, -v4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fma_f32 v7, 0x3f317217, v3, -v5 +; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v6, 0x3377d1cf, v1 :: v_dual_fmac_f32 v7, 0x3377d1cf, v3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_add_f32 v4, v4, v6 :: v_dual_add_f32 v5, v5, v7 +; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_mov_b32 v4, 0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2 +; GFX1100-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log_v2f32: @@ -762,56 +769,59 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log_v3f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 
0x41b17218 +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3f317217 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: s_mov_b32 s11, 0x3377d1cf -; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s9, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s9, 0x3f317217 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 -; SI-SDAG-NEXT: v_fma_f32 v3, v1, s9, -v2 -; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, v3 -; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s12 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, s8, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s9, v3 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3 +; SI-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s6, -v4 +; SI-SDAG-NEXT: s_mov_b32 s7, 0x3377d1cf +; SI-SDAG-NEXT: v_ldexp_f32_e32 v6, s8, v6 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, v5 +; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000 +; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v2, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9 +; 
SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v2 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s9, -v4 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, v5 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v6 +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; SI-SDAG-NEXT: v_fma_f32 v4, v6, s6, -v3 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; SI-SDAG-NEXT: v_fma_f32 v4, v6, s7, v4 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s10, v0 -; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; SI-SDAG-NEXT: v_log_f32_e32 v5, v0 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s12 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5 -; SI-SDAG-NEXT: v_fma_f32 v4, v5, s9, -v2 -; SI-SDAG-NEXT: v_fma_f32 v4, v5, s11, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v5|, s12 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; SI-SDAG-NEXT: v_log_f32_e32 v4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v4 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 +; SI-SDAG-NEXT: v_fma_f32 v5, v4, s6, -v3 +; SI-SDAG-NEXT: v_fma_f32 v5, v4, s7, v5 +; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s9 +; 
SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log_v3f32: @@ -871,55 +881,59 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s10, v1 -; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; VI-SDAG-NEXT: v_ldexp_f32 v2, s9, v2 -; VI-SDAG-NEXT: v_log_f32_e32 v3, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 -; VI-SDAG-NEXT: 
v_and_b32_e32 v1, 0xfffff000, v3 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v1 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v1 -; VI-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 -; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 -; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v5, v0, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v6, s1, v6 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-SDAG-NEXT: 
v_sub_f32_e32 v2, v3, v2 +; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v6 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v4, v6, v3 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v3 +; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v8, v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v7, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v1 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 +; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v4 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v4, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v3 @@ -927,12 +941,12 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm ; @@ -1005,55 +1019,58 @@ define 
amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; GFX900-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v3 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf -; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s10, v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 -; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s4, -v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s5, v4 -; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s10 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; 
GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX900-SDAG-NEXT: v_ldexp_f32 v2, s9, v2 -; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v2 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v4 -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s4, -v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s5, v6 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s10 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3] -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s4, -v4 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s5, v6 -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s10 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s4, -v5 +; GFX900-SDAG-NEXT: v_ldexp_f32 v7, s1, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s5, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s8 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v7 +; GFX900-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX900-SDAG-NEXT: 
v_fma_f32 v5, v7, s4, -v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s5, v5 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v1 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s8 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v5 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v5, s4, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v5, s5, v6 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s8 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 +; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_v3f32: @@ -1113,60 +1130,52 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s6 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) 
| instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s3 +; GFX1100-SDAG-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s6 +; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; GFX1100-SDAG-NEXT: s_and_b32 s6, s6, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s2, s3 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v4, s1, s6 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x41b17218, s7 +; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 32, 0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v2 :: v_dual_lshlrev_b32 v0, 5, v0 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s2, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_fma_f32 v8, 0x3f317217, v2, -v5 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v8, 0x3377d1cf, v2 -; GFX1100-SDAG-NEXT: v_add_f32_e32 v5, v5, v8 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fma_f32 v6, 0x3f317217, v0, -v3 -; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v6, 0x3377d1cf, v0 :: v_dual_lshlrev_b32 v1, 5, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s1, v1 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v5, s0, s2 ; GFX1100-SDAG-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v1 -; GFX1100-SDAG-NEXT: v_fma_f32 v7, 0x3f317217, v1, -v4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 -; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, 0x3f317217, v2 :: v_dual_mul_f32 v7, 0x3f317217, v4 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| -; GFX1100-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc_lo -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6 +; GFX1100-SDAG-NEXT: v_fma_f32 v9, 0x3f317217, v2, -v6 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3f317217, v4, -v7 +; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v9, 0x3377d1cf, v2 :: v_dual_fmac_f32 v10, 0x3377d1cf, v4 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v8, 0x3f317217, v5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_dual_add_f32 v6, v6, v9 :: v_dual_add_f32 
v7, v7, v10 +; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3f317217, v5, -v8 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v4| +; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v11, 0x3377d1cf, v5 :: v_dual_sub_f32 v2, v2, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_add_f32_e32 v8, v8, v11 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v5| +; GFX1100-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_sub_f32 v1, v4, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] +; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log_v3f32: @@ -1387,68 +1396,72 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log_v4f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-SDAG-NEXT: s_mov_b32 s12, 0x3377d1cf -; SI-SDAG-NEXT: s_mov_b32 s13, 0x7f800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3f317217 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s11, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s11, 0x3f317217 -; SI-SDAG-NEXT: v_mov_b32_e32 
v4, 0x41b17218 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 -; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, -v2 -; SI-SDAG-NEXT: v_fma_f32 v3, v1, s12, v3 -; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, s10, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s11, v3 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3 +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s6, -v4 +; SI-SDAG-NEXT: s_mov_b32 s7, 0x3377d1cf +; SI-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, v5 +; SI-SDAG-NEXT: s_mov_b32 s11, 0x7f800000 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v6, s10, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s11 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v2 -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, -v1 -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s12, v5 -; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s9, v5 -; SI-SDAG-NEXT: v_log_f32_e32 v5, v5 -; SI-SDAG-NEXT: 
v_cmp_lt_f32_e64 s[2:3], |v2|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1] -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; SI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v5 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s8, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v5, s11, -v1 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v5, s12, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 -; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v0, s11, -v5 -; SI-SDAG-NEXT: v_fma_f32 v6, v0, s12, v6 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; SI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v6 +; SI-SDAG-NEXT: v_mov_b32_e32 v7, s4 +; SI-SDAG-NEXT: v_fma_f32 v4, v6, s6, -v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v7, s9, v7 +; SI-SDAG-NEXT: v_fma_f32 v4, v6, s7, v4 +; SI-SDAG-NEXT: v_log_f32_e32 v7, v7 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s11 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5 +; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v7 +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; SI-SDAG-NEXT: v_fma_f32 v6, v7, s6, -v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; SI-SDAG-NEXT: v_fma_f32 v6, v7, s7, v6 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s8, v1 ; SI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 -; SI-SDAG-NEXT: 
v_cmp_lt_f32_e64 vcc, |v0|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; SI-SDAG-NEXT: s_mov_b32 s6, -1 -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-SDAG-NEXT: v_log_f32_e32 v6, v1 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s11 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v6 +; SI-SDAG-NEXT: v_fma_f32 v5, v6, s6, -v4 +; SI-SDAG-NEXT: v_fma_f32 v5, v6, s7, v5 +; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s11 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log_v4f32: @@ -1520,84 +1533,88 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 ; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s11, v1 -; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 -; 
VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; VI-SDAG-NEXT: v_ldexp_f32 v2, s10, v2 -; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v5, v2, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s6 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v5, s9, v5 -; VI-SDAG-NEXT: v_log_f32_e32 v5, v5 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 
0, v4, s[0:1] -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; VI-SDAG-NEXT: v_sub_f32_e32 v6, v5, v1 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v1 -; VI-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 -; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 -; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, v0, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v6 +; VI-SDAG-NEXT: v_mov_b32_e32 v6, s3 +; VI-SDAG-NEXT: v_ldexp_f32 v6, s2, v6 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s8 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v6 +; VI-SDAG-NEXT: v_sub_f32_e32 v4, v6, v2 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v2 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: v_add_f32_e32 v4, v8, v4 +; VI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v7, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 +; VI-SDAG-NEXT: 
v_ldexp_f32 v7, s1, v7 +; VI-SDAG-NEXT: v_log_f32_e32 v7, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s8 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5 +; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v7 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v6, v7, v5 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3f317000, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v5 +; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; VI-SDAG-NEXT: v_add_f32_e32 v6, v9, v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v1 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s8 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v6 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v6, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s8 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1690,67 +1707,71 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 -; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; GFX900-SDAG-NEXT: s_mov_b32 s8, 0x3f317217 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s11, v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: s_mov_b32 s11, 0x7f800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 -; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2 -; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s5, v3 -; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX900-SDAG-NEXT: v_ldexp_f32 v2, s10, v2 -; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v2, s4, -v1 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v2, 
s5, v6 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v6, 5, v6 -; GFX900-SDAG-NEXT: v_ldexp_f32 v6, s9, v6 -; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v6 -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v6, s4, -v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v6, s5, v7 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[2:3] -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v6 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317217, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s4, -v6 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s5, v7 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v3 +; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; GFX900-SDAG-NEXT: s_mov_b32 s9, 0x3377d1cf +; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s8, -v5 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, s3 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s9, v6 +; GFX900-SDAG-NEXT: s_mov_b32 s10, 
0x7f800000 +; GFX900-SDAG-NEXT: v_ldexp_f32 v7, s2, v7 +; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s10 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v7 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v8, s2 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s8, -v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v8, s1, v8 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s9, v5 +; GFX900-SDAG-NEXT: v_log_f32_e32 v8, v8 +; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s10 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317217, v8 +; GFX900-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v8, s8, -v6 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v8, s9, v7 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[0:1] -; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 +; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v1 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v8|, s10 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v7, 
s8, -v5 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v7, s9, v6 +; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s10 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v5, v0 ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; @@ -1824,68 +1845,61 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s9 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s6 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s1, v2 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s0, v3 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; 
GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s6 +; GFX1100-SDAG-NEXT: s_and_b32 s6, s6, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s7 +; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s3, s6 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 +; GFX1100-SDAG-NEXT: s_cselect_b32 s7, 32, 0 +; GFX1100-SDAG-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; GFX1100-SDAG-NEXT: s_and_b32 s6, s9, exec_lo +; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, s7 +; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v6, s1, s3 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v7, s0, s2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s9 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v8, 0x3f317217, v2 :: v_dual_mul_f32 v9, 0x3f317217, v3 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317217, v2 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v8, 0x3f317217, v3 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3f317217, v2, -v7 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3f317217, v3, -v8 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v10, 0x3f317217, v6 :: v_dual_mul_f32 v11, 0x3f317217, v7 +; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3f317217, v2, -v8 +; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3f317217, v3, -v9 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_fma_f32 v14, 0x3f317217, v6, -v10 +; GFX1100-SDAG-NEXT: v_fma_f32 v15, 0x3f317217, v7, -v11 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3 -; GFX1100-SDAG-NEXT: v_add_f32_e32 v7, v7, v12 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_add_f32 v8, v8, v13 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317217, v1 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3f317217, v0, -v5 -; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3f317217, v1, -v6 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v10, 0x3377d1cf, v0 :: v_dual_fmac_f32 v11, 0x3377d1cf, v1 -; GFX1100-SDAG-NEXT: v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc_lo +; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v14, 0x3377d1cf, v6 :: v_dual_fmac_f32 v15, 0x3377d1cf, v7 +; GFX1100-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_dual_add_f32 v8, v8, v12 :: v_dual_add_f32 v9, v9, v13 +; GFX1100-SDAG-NEXT: v_dual_add_f32 v10, v10, v14 :: v_dual_add_f32 v11, v11, v15 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_sub_f32 v2, v1, v9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc_lo -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4 -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15 +; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v8, v3, v9 :: v_dual_mov_b32 v9, 0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v6| +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v7| +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1] +; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log_v4f32: @@ -2143,8 +2157,7 @@ define float @v_log_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 
s4, 0x3f317217 @@ -2189,8 +2202,7 @@ define float @v_log_f32(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -2241,8 +2253,7 @@ define float @v_log_f32(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -2286,22 +2297,21 @@ define float @v_log_f32(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2347,8 +2357,7 @@ define float @v_log_fabs_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -2393,8 +2402,7 @@ define float @v_log_fabs_f32(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -2445,8 +2453,7 @@ define float @v_log_fabs_f32(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: 
s_mov_b32 s4, 0x3f317217 @@ -2491,22 +2498,20 @@ define float @v_log_fabs_f32(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, s0 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2554,8 +2559,7 @@ define float @v_log_fneg_fabs_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 
0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -|v0|, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -2600,8 +2604,7 @@ define float @v_log_fneg_fabs_f32(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -2652,8 +2655,7 @@ define float @v_log_fneg_fabs_f32(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -2698,22 +2700,20 @@ define float @v_log_fneg_fabs_f32(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e64 s0, 0x80800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, s0 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, 
|v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2762,8 +2762,7 @@ define float @v_log_fneg_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -2808,8 +2807,7 @@ define float @v_log_fneg_f32(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -2860,8 +2858,7 @@ define float @v_log_fneg_f32(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: 
v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -2905,22 +2902,21 @@ define float @v_log_fneg_f32(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2968,8 +2964,7 @@ define float @v_log_f32_fast(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: 
v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -2995,8 +2990,7 @@ define float @v_log_f32_fast(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3022,8 +3016,7 @@ define float @v_log_f32_fast(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3048,12 +3041,10 @@ define float @v_log_f32_fast(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; 
GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 @@ -3088,8 +3079,7 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3115,8 +3105,7 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3142,8 +3131,7 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3168,12 +3156,10 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: 
v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 @@ -3208,8 +3194,7 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3235,8 +3220,7 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3262,8 +3246,7 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ 
-3288,12 +3271,10 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 @@ -3328,8 +3309,7 @@ define float @v_log_f32_ninf(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -3374,8 +3354,7 @@ define float @v_log_f32_ninf(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -3426,8 +3405,7 @@ define float @v_log_f32_ninf(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 
v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -3471,22 +3449,21 @@ define float @v_log_f32_ninf(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3532,8 +3509,7 @@ define float @v_log_f32_afn(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 
0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3559,8 +3535,7 @@ define float @v_log_f32_afn(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3586,8 +3561,7 @@ define float @v_log_f32_afn(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3612,12 +3586,10 @@ define float @v_log_f32_afn(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: 
s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 @@ -3681,8 +3653,7 @@ define float @v_log_f32_afn_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3708,8 +3679,7 @@ define float @v_log_f32_afn_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3735,8 +3705,7 @@ define float @v_log_f32_afn_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3761,12 +3730,10 @@ define float @v_log_f32_afn_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 @@ -3801,8 +3768,7 @@ define float @v_fabs_log_f32_afn(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3828,8 +3794,7 @@ define float @v_fabs_log_f32_afn(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3855,8 +3820,7 @@ define float @v_fabs_log_f32_afn(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 @@ -3882,11 +3846,10 @@ define float @v_fabs_log_f32_afn(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 
0x800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, s0 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 @@ -4063,8 +4026,7 @@ define float @v_log_f32_nnan(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -4109,8 +4071,7 @@ define float @v_log_f32_nnan(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -4161,8 +4122,7 @@ define float @v_log_f32_nnan(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 
v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -4206,22 +4166,21 @@ define float @v_log_f32_nnan(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4407,8 +4366,7 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 
5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -4453,8 +4411,7 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -4505,8 +4462,7 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -4550,22 +4506,21 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; 
GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4751,8 +4706,7 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -4797,8 +4751,7 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -4849,8 +4802,7 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: 
v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -4894,22 +4846,21 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4955,8 +4906,7 @@ define float @v_log_f32_nnan_ninf(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -4995,8 +4945,7 @@ define float @v_log_f32_nnan_ninf(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -5041,8 +4990,7 @@ define float @v_log_f32_nnan_ninf(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -5080,20 +5028,18 @@ define float @v_log_f32_nnan_ninf(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; 
GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5240,8 +5186,7 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -5280,8 +5225,7 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -5326,8 +5270,7 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: 
v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -5365,20 +5308,18 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5451,8 +5392,7 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; 
SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -5497,8 +5437,7 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -5549,8 +5488,7 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -5594,22 +5532,21 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: 
v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6132,8 +6069,7 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 @@ -6302,8 +6238,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, 0x800000 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-NEXT: v_log_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s4, 0x3f317217 @@ -6326,8 +6261,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_mov_b32 s4, 0x800000 ; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-NEXT: v_log_f32_e32 v0, v0 ; VI-NEXT: 
s_mov_b32 s4, 0x7f800000 @@ -6353,8 +6287,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-NEXT: v_log_f32_e32 v0, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x3f317217 @@ -6377,22 +6310,20 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; diff 
--git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index d09df75837339..d7cefd6ed12ec 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -14,30 +14,31 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s0, s[4:5], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a -; SI-SDAG-NEXT: s_mov_b32 s6, -1 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s1, v2 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 -; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; 
SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3284fbcf +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, v3 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log10_f32: @@ -70,32 +71,34 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; VI-SDAG-NEXT: v_ldexp_f32 v0, s0, v0 -; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v4, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 
s[0:1], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 ; VI-SDAG-NEXT: s_endpgm ; @@ -132,29 +135,30 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, 
v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s6, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a -; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 -; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s0, -v2 -; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s1, v3 -; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX900-SDAG-NEXT: global_store_dword v0, v1, s[2:3] +; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a +; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3284fbcf +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4 +; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_f32: @@ -188,26 +192,25 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x411a209b, s1 +; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s0, s1 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| +; GFX1100-SDAG-NEXT: v_fma_f32 v3, 0x3e9a209a, v1, -v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_fmamk_f32 v3, v1, 0x3284fbcf, v3 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: 
global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm @@ -316,44 +319,46 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf ; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s3, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s3, 0x3e9a209a -; SI-SDAG-NEXT: s_mov_b32 s4, s0 -; SI-SDAG-NEXT: s_mov_b32 s5, s1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 -; SI-SDAG-NEXT: v_fma_f32 v3, v1, s3, -v2 -; SI-SDAG-NEXT: v_fma_f32 v3, v1, s8, v3 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: s_mov_b32 s0, s4 +; SI-SDAG-NEXT: s_mov_b32 s1, s5 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: s_mov_b32 s7, 0x3e9a209a +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3 +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-SDAG-NEXT: v_log_f32_e32 v5, 
v1 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s9 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s2, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b -; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0 -; SI-SDAG-NEXT: v_fma_f32 v4, v0, s3, -v3 -; SI-SDAG-NEXT: v_fma_f32 v4, v0, s8, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s9 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; SI-SDAG-NEXT: s_mov_b32 s6, -1 -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log10_v2f32: @@ -398,49 +403,51 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b ; 
VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s7, v1 -; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5 +; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-SDAG-NEXT: v_log_f32_e32 v5, v1 +; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 
0x3e9a2000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s2 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; VI-SDAG-NEXT: v_ldexp_f32 v0, s6, v0 -; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 -; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v4, v0, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; @@ -494,41 +501,43 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a -; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3284fbcf -; GFX900-SDAG-NEXT: s_mov_b32 
s4, 0x7f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x3284fbcf +; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x7f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s11, v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v1 -; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, s4 +; GFX900-SDAG-NEXT: v_ldexp_f32 v4, s3, v4 +; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3e9a209a +; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v4 +; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s3, -v5 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s6, v6 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v1 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s7 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v6 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v6, s3, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v6, s6, v4 ; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX900-SDAG-NEXT: 
v_cmp_lt_f32_e64 s[0:1], s10, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s10, v0 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s2, -v4 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s3, v5 -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] -; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 -; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s7 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 +; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_v2f32: @@ -574,39 +583,37 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; 
GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x411a209b, s4 +; GFX1100-SDAG-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s5 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s3, s4 +; GFX1100-SDAG-NEXT: s_and_b32 s5, s5, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s5, 32, 0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, s5 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v3, 0x3e9a209a, v1 :: v_dual_lshlrev_b32 v0, 5, v0 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_fma_f32 v5, 0x3e9a209a, v1, -v3 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v5, 0x3284fbcf, v1 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3e9a209a, v0 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s5 -; GFX1100-SDAG-NEXT: v_fma_f32 v4, 0x3e9a209a, v0, -v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v4, 0x3284fbcf, v0 -; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX1100-SDAG-NEXT: 
v_mul_f32_e32 v4, 0x3e9a209a, v1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0 -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5 -; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v3 +; GFX1100-SDAG-NEXT: v_fma_f32 v6, 0x3e9a209a, v1, -v4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fma_f32 v7, 0x3e9a209a, v3, -v5 +; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v6, 0x3284fbcf, v1 :: v_dual_fmac_f32 v7, 0x3284fbcf, v3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_add_f32 v4, v4, v6 :: v_dual_add_f32 v5, v5, v7 +; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_mov_b32 v4, 0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2 +; GFX1100-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log10_v2f32: @@ -762,56 +769,59 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-SDAG-LABEL: s_log10_v3f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3e9a209a ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: s_mov_b32 s11, 0x3284fbcf -; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; 
SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s9, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s9, 0x3e9a209a -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 -; SI-SDAG-NEXT: v_fma_f32 v3, v1, s9, -v2 -; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, v3 -; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s12 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, s8, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b -; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s9, v3 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3 +; SI-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s6, -v4 +; SI-SDAG-NEXT: s_mov_b32 s7, 0x3284fbcf +; SI-SDAG-NEXT: v_ldexp_f32_e32 v6, s8, v6 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, v5 +; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000 +; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v2, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v2 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s9, -v4 -; 
SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, v5 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v6 +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; SI-SDAG-NEXT: v_fma_f32 v4, v6, s6, -v3 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; SI-SDAG-NEXT: v_fma_f32 v4, v6, s7, v4 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s10, v0 -; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; SI-SDAG-NEXT: v_log_f32_e32 v5, v0 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s12 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5 -; SI-SDAG-NEXT: v_fma_f32 v4, v5, s9, -v2 -; SI-SDAG-NEXT: v_fma_f32 v4, v5, s11, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v5|, s12 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; SI-SDAG-NEXT: v_log_f32_e32 v4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v4 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 +; SI-SDAG-NEXT: v_fma_f32 v5, v4, s6, -v3 +; SI-SDAG-NEXT: v_fma_f32 v5, v4, s7, v5 +; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s9 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; 
SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log10_v3f32: @@ -871,55 +881,59 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s10, v1 -; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; VI-SDAG-NEXT: v_ldexp_f32 v2, s9, v2 -; VI-SDAG-NEXT: v_log_f32_e32 v3, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v3 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v1 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: 
s_and_b64 s[6:7], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v1 -; VI-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 -; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 -; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v5, v0, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v6, s1, v6 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 +; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v6 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v4, v6, v3 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v4 +; VI-SDAG-NEXT: 
v_mul_f32_e32 v4, 0x369a84fb, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v3 +; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v8, v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v7, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v1 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 +; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v4 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v4, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v3 @@ -927,12 +941,12 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm ; @@ -1005,55 +1019,58 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX900-SDAG-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; GFX900-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a +; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v3 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf -; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s10, v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 -; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s4, -v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s5, v4 -; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s10 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX900-SDAG-NEXT: v_ldexp_f32 v2, s9, v2 -; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v2 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 
1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v4 -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s4, -v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s5, v6 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s10 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3] -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s4, -v4 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s5, v6 -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s10 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s4, -v5 +; GFX900-SDAG-NEXT: v_ldexp_f32 v7, s1, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s5, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s8 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v7 +; GFX900-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s4, -v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s5, v5 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX900-SDAG-NEXT: 
v_log_f32_e32 v5, v1 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s8 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v5 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v5, s4, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v5, s5, v6 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s8 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 +; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_v3f32: @@ -1113,60 +1130,52 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s6 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x411a209b, s3 +; GFX1100-SDAG-NEXT: s_and_b32 s3, s3, 
exec_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s6 +; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; GFX1100-SDAG-NEXT: s_and_b32 s6, s6, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s2, s3 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v4, s1, s6 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x411a209b, s7 +; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 32, 0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v2 :: v_dual_lshlrev_b32 v0, 5, v0 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s2, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_fma_f32 v8, 0x3e9a209a, v2, -v5 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v8, 0x3284fbcf, v2 -; GFX1100-SDAG-NEXT: v_add_f32_e32 v5, v5, v8 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fma_f32 v6, 0x3e9a209a, v0, -v3 -; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v6, 0x3284fbcf, v0 :: v_dual_lshlrev_b32 v1, 5, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s1, v1 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v5, s0, s2 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 +; 
GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v1 -; GFX1100-SDAG-NEXT: v_fma_f32 v7, 0x3e9a209a, v1, -v4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 -; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, 0x3e9a209a, v2 :: v_dual_mul_f32 v7, 0x3e9a209a, v4 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| -; GFX1100-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc_lo -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6 +; GFX1100-SDAG-NEXT: v_fma_f32 v9, 0x3e9a209a, v2, -v6 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3e9a209a, v4, -v7 +; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v9, 0x3284fbcf, v2 :: v_dual_fmac_f32 v10, 0x3284fbcf, v4 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_dual_add_f32 v6, v6, v9 :: v_dual_add_f32 v7, v7, v10 +; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3e9a209a, v5, -v8 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, 
|v4| +; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v11, 0x3284fbcf, v5 :: v_dual_sub_f32 v2, v2, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_add_f32_e32 v8, v8, v11 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v5| +; GFX1100-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_sub_f32 v1, v4, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] +; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log10_v3f32: @@ -1387,68 +1396,72 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-LABEL: s_log10_v4f32: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-SDAG-NEXT: s_mov_b32 s12, 0x3284fbcf -; SI-SDAG-NEXT: s_mov_b32 s13, 0x7f800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3e9a209a ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s11, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s11, 0x3e9a209a -; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 -; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, -v2 -; SI-SDAG-NEXT: v_fma_f32 v3, v1, s12, v3 -; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], 
|v1|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, s10, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s11, v3 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3 +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s6, -v4 +; SI-SDAG-NEXT: s_mov_b32 s7, 0x3284fbcf +; SI-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, v5 +; SI-SDAG-NEXT: s_mov_b32 s11, 0x7f800000 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v6, s10, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s11 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v2 -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, -v1 -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s12, v5 -; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s9, v5 -; SI-SDAG-NEXT: v_log_f32_e32 v5, v5 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1] -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-SDAG-NEXT: 
v_lshlrev_b32_e32 v0, 5, v0 -; SI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v5 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s8, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v5, s11, -v1 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v5, s12, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 -; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v0, s11, -v5 -; SI-SDAG-NEXT: v_fma_f32 v6, v0, s12, v6 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; SI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v6 +; SI-SDAG-NEXT: v_mov_b32_e32 v7, s4 +; SI-SDAG-NEXT: v_fma_f32 v4, v6, s6, -v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v7, s9, v7 +; SI-SDAG-NEXT: v_fma_f32 v4, v6, s7, v4 +; SI-SDAG-NEXT: v_log_f32_e32 v7, v7 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s11 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5 +; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v7 +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; SI-SDAG-NEXT: v_fma_f32 v6, v7, s6, -v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; SI-SDAG-NEXT: v_fma_f32 v6, v7, s7, v6 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s8, v1 ; SI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; SI-SDAG-NEXT: s_mov_b32 s6, -1 -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], 
off, s[4:7], 0 +; SI-SDAG-NEXT: v_log_f32_e32 v6, v1 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s11 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v6 +; SI-SDAG-NEXT: v_fma_f32 v5, v6, s6, -v4 +; SI-SDAG-NEXT: v_fma_f32 v5, v6, s7, v5 +; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s11 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log10_v4f32: @@ -1520,84 +1533,88 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 ; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s11, v1 -; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; VI-SDAG-NEXT: 
v_cmp_lt_f32_e64 s[0:1], s10, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; VI-SDAG-NEXT: v_ldexp_f32 v2, s10, v2 -; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v5, v2, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s6 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v5, s9, v5 -; VI-SDAG-NEXT: v_log_f32_e32 v5, v5 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1] -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; VI-SDAG-NEXT: v_sub_f32_e32 v6, v5, v1 -; 
VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v1 -; VI-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 -; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 -; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, v0, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v6 +; VI-SDAG-NEXT: v_mov_b32_e32 v6, s3 +; VI-SDAG-NEXT: v_ldexp_f32 v6, s2, v6 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s8 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v6 +; VI-SDAG-NEXT: v_sub_f32_e32 v4, v6, v2 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v2 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: v_add_f32_e32 v4, v8, v4 +; VI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v7, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 +; VI-SDAG-NEXT: v_ldexp_f32 v7, s1, v7 +; VI-SDAG-NEXT: v_log_f32_e32 v7, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s8 +; 
VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5 +; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v7 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v6, v7, v5 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3e9a2000, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v5 +; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; VI-SDAG-NEXT: v_add_f32_e32 v6, v9, v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v1 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s8 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v6 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v6, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s8 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1690,67 +1707,71 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, 
<4 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a -; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; GFX900-SDAG-NEXT: s_mov_b32 s8, 0x3e9a209a ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s11, v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: s_mov_b32 s11, 0x7f800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 -; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2 -; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s5, v3 -; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX900-SDAG-NEXT: v_ldexp_f32 v2, s10, v2 -; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v2, s4, -v1 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v2, s5, v6 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v6, 5, v6 -; GFX900-SDAG-NEXT: v_ldexp_f32 v6, s9, v6 -; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 -; 
GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v6 -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v6, s4, -v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v6, s5, v7 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[2:3] -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v6 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s4, -v6 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s5, v7 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v3 +; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; GFX900-SDAG-NEXT: s_mov_b32 s9, 0x3284fbcf +; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s8, -v5 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, s3 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s9, v6 +; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000 +; GFX900-SDAG-NEXT: v_ldexp_f32 v7, s2, v7 +; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, 
s10 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v7 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v8, s2 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s8, -v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v8, s1, v8 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s9, v5 +; GFX900-SDAG-NEXT: v_log_f32_e32 v8, v8 +; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s10 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v8 +; GFX900-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v8, s8, -v6 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v8, s9, v7 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[0:1] -; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 +; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v1 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v8|, s10 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v7, s8, -v5 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v7, s9, v6 +; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s10 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 
v4, 0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v5, v0 ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; @@ -1824,68 +1845,61 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s9 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s6 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s1, v2 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s0, v3 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x411a209b, s6 +; GFX1100-SDAG-NEXT: s_and_b32 s6, s6, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s7 +; GFX1100-SDAG-NEXT: s_and_b32 s7, 
s7, exec_lo +; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s3, s6 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8 +; GFX1100-SDAG-NEXT: s_cselect_b32 s7, 32, 0 +; GFX1100-SDAG-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; GFX1100-SDAG-NEXT: s_and_b32 s6, s9, exec_lo +; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, s7 +; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v6, s1, s3 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v7, s0, s2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s9 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v8, 0x3e9a209a, v2 :: v_dual_mul_f32 v9, 0x3e9a209a, v3 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v2 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v3 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3e9a209a, v2, -v7 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3e9a209a, v3, -v8 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v10, 0x3e9a209a, v6 :: v_dual_mul_f32 v11, 0x3e9a209a, v7 +; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3e9a209a, v2, -v8 +; GFX1100-SDAG-NEXT: v_fma_f32 v13, 
0x3e9a209a, v3, -v9 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_fma_f32 v14, 0x3e9a209a, v6, -v10 +; GFX1100-SDAG-NEXT: v_fma_f32 v15, 0x3e9a209a, v7, -v11 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3 -; GFX1100-SDAG-NEXT: v_add_f32_e32 v7, v7, v12 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_add_f32 v8, v8, v13 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v1 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3e9a209a, v0, -v5 -; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3e9a209a, v1, -v6 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v10, 0x3284fbcf, v0 :: v_dual_fmac_f32 v11, 0x3284fbcf, v1 -; GFX1100-SDAG-NEXT: v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc_lo +; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v14, 0x3284fbcf, v6 :: v_dual_fmac_f32 v15, 0x3284fbcf, v7 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_dual_add_f32 v8, v8, v12 :: v_dual_add_f32 v9, v9, v13 +; GFX1100-SDAG-NEXT: v_dual_add_f32 v10, v10, v14 :: v_dual_add_f32 v11, v11, v15 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_sub_f32 v2, v1, v9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc_lo -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4 -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15 +; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v8, v3, v9 :: v_dual_mov_b32 v9, 0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v6| +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v7| +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1] +; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log10_v4f32: @@ -2143,8 +2157,7 @@ define float @v_log10_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -2189,8 +2202,7 @@ define float @v_log10_f32(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; 
VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -2241,8 +2253,7 @@ define float @v_log10_f32(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -2286,22 +2297,21 @@ define float @v_log10_f32(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2347,8 +2357,7 @@ define float @v_log10_fabs_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -2393,8 +2402,7 @@ define float @v_log10_fabs_f32(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -2445,8 +2453,7 @@ define float @v_log10_fabs_f32(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -2491,22 +2498,20 @@ define float @v_log10_fabs_f32(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, s0 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2554,8 +2559,7 @@ define float @v_log10_fneg_fabs_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -|v0|, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -2600,8 +2604,7 @@ define float @v_log10_fneg_fabs_f32(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -2652,8 +2655,7 @@ define float @v_log10_fneg_fabs_f32(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -2698,22 +2700,20 @@ define float @v_log10_fneg_fabs_f32(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e64 s0, 0x80800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, s0 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2762,8 +2762,7 @@ define float @v_log10_fneg_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -2808,8 +2807,7 @@ define float @v_log10_fneg_f32(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -2860,8 +2858,7 @@ define float @v_log10_fneg_f32(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -2905,22 +2902,21 @@ 
define float @v_log10_fneg_f32(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2968,8 +2964,7 @@ define float @v_log10_f32_fast(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: 
v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -2995,8 +2990,7 @@ define float @v_log10_f32_fast(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3022,8 +3016,7 @@ define float @v_log10_f32_fast(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3048,12 +3041,10 @@ define float @v_log10_f32_fast(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 @@ -3088,8 +3079,7 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: 
s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3115,8 +3105,7 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3142,8 +3131,7 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3168,12 +3156,10 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, 
v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 @@ -3208,8 +3194,7 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3235,8 +3220,7 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3262,8 +3246,7 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3288,12 +3271,10 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 
0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 @@ -3328,8 +3309,7 @@ define float @v_log10_f32_ninf(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -3374,8 +3354,7 @@ define float @v_log10_f32_ninf(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -3426,8 +3405,7 @@ define float @v_log10_f32_ninf(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: 
s_mov_b32 s4, 0x3e9a209a @@ -3471,22 +3449,21 @@ define float @v_log10_f32_ninf(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3532,8 +3509,7 @@ define float @v_log10_f32_afn(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: 
v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3559,8 +3535,7 @@ define float @v_log10_f32_afn(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3586,8 +3561,7 @@ define float @v_log10_f32_afn(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3612,12 +3586,10 @@ define float @v_log10_f32_afn(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 @@ -3681,8 +3653,7 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3708,8 +3679,7 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3735,8 +3705,7 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3761,12 +3730,10 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 @@ -3801,8 +3768,7 @@ define float @v_fabs_log10_f32_afn(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3828,8 +3794,7 @@ define float @v_fabs_log10_f32_afn(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3855,8 +3820,7 @@ define float @v_fabs_log10_f32_afn(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b @@ -3882,11 +3846,10 @@ define float @v_fabs_log10_f32_afn(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 
0, 32, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, s0 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 @@ -4063,8 +4026,7 @@ define float @v_log10_f32_nnan(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -4109,8 +4071,7 @@ define float @v_log10_f32_nnan(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -4161,8 +4122,7 @@ define float @v_log10_f32_nnan(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -4206,22 +4166,21 @@ define float @v_log10_f32_nnan(float %in) { ; GFX1100-SDAG: ; %bb.0: ; 
GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4407,8 +4366,7 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -4453,8 
+4411,7 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -4505,8 +4462,7 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -4550,22 +4506,21 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4751,8 +4706,7 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -4797,8 +4751,7 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -4849,8 +4802,7 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -4894,22 +4846,21 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4955,8 +4906,7 @@ define float @v_log10_f32_nnan_ninf(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 
5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -4995,8 +4945,7 @@ define float @v_log10_f32_nnan_ninf(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -5041,8 +4990,7 @@ define float @v_log10_f32_nnan_ninf(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -5080,20 +5028,18 @@ define float @v_log10_f32_nnan_ninf(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 ; GFX1100-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5240,8 +5186,7 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -5280,8 +5225,7 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -5326,8 +5270,7 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; 
GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -5365,20 +5308,18 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5451,8 +5392,7 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -5497,8 +5437,7 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { ; VI-SDAG-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 @@ -5549,8 +5488,7 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -5594,22 +5532,21 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 
0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6132,8 +6069,7 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -6302,8 +6238,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, 0x800000 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-NEXT: v_log_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -6326,8 +6261,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_mov_b32 s4, 0x800000 ; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-NEXT: v_log_f32_e32 v0, v0 ; VI-NEXT: s_mov_b32 s4, 0x7f800000 @@ -6353,8 +6287,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; 
GFX900-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-NEXT: v_log_f32_e32 v0, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x3e9a209a @@ -6377,22 +6310,20 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo ; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 8b3b79b0b1bdd..ebfc953a6bb96 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -14,18 +14,19 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb ; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 ; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -52,16 +53,17 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c ; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s2 
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s6, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -90,17 +92,18 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s6, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] @@ -130,13 +133,12 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: s_and_b32 s0, s0, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; 
GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, s3 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 @@ -221,14 +223,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s3, v3 +; SI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v4, v1 @@ -271,14 +275,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, 
exec ; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v4, v1 @@ -322,14 +328,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v1 @@ -365,26 +373,24 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX1100-SDAG-LABEL: s_log2_v2f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX1100-SDAG-NEXT: 
v_cndmask_b32_e64 v3, 0, 1, s5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s4 +; GFX1100-SDAG-NEXT: s_and_b32 s4, s4, exec_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s5 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s3, v1 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; GFX1100-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; GFX1100-SDAG-NEXT: s_and_b32 s5, s5, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s5, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s3, s4 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, s5 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2 +; GFX1100-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v0 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v2 ; GFX1100-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -488,20 +494,23 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s1, v3 +; 
SI-SDAG-NEXT: s_cselect_b32 s1, 32, 0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s1, v3 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s0, v5 +; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s2, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s0, v5 ; SI-SDAG-NEXT: v_log_f32_e32 v7, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v5, v5 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc @@ -555,19 +564,22 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s2 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; VI-SDAG-NEXT: v_ldexp_f32 v5, s1, v5 -; VI-SDAG-NEXT: 
v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_log_f32_e32 v5, v5 ; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 @@ -622,19 +634,22 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, s2 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; GFX900-SDAG-NEXT: v_ldexp_f32 v5, s1, v5 +; GFX900-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v5 @@ -682,35 +697,31 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: s_clause 0x1 ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: 
s_load_b64 s[4:5], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s6 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; GFX1100-SDAG-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s6 +; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; GFX1100-SDAG-NEXT: s_and_b32 s6, s6, exec_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s2, s3 +; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v4, s1, s6 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v5, s0, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_ldexp_f32 v4, s1, v4 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v5, s0, v5 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 
0xfff -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v1, v4, v1 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s2, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3 ; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[4:5] ; GFX1100-SDAG-NEXT: s_endpgm @@ -852,24 +863,28 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-SDAG-NEXT: s_cselect_b32 s8, 32, 0 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s8 +; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3 +; SI-SDAG-NEXT: s_cselect_b32 s7, 32, 0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v5, s7 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s6, v5 +; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-SDAG-NEXT: s_cselect_b32 s6, 32, 0 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v7, 5, v7 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3 -; SI-SDAG-NEXT: 
v_ldexp_f32_e32 v5, s6, v5 +; SI-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-SDAG-NEXT: v_ldexp_f32_e32 v7, s5, v7 +; SI-SDAG-NEXT: s_cselect_b32 s5, 32, 0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v5, v5 @@ -930,26 +945,30 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; VI-SDAG-NEXT: s_cselect_b32 s6, 32, 0 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s6 +; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; VI-SDAG-NEXT: v_ldexp_f32 v5, s2, v5 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; VI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: v_ldexp_f32 v7, s1, v7 +; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 -; VI-SDAG-NEXT: v_ldexp_f32 v5, s2, v5 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v7, 5, v7 -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; 
VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_log_f32_e32 v5, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v7, s1, v7 ; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v7, v7 ; VI-SDAG-NEXT: v_log_f32_e32 v8, v1 @@ -1011,24 +1030,28 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, s3 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; GFX900-SDAG-NEXT: v_ldexp_f32 v6, s2, v6 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v6, 5, v6 -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v8, 5, v8 -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 -; GFX900-SDAG-NEXT: v_ldexp_f32 v6, s2, v6 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v8, s2 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; GFX900-SDAG-NEXT: v_ldexp_f32 v8, s1, v8 +; GFX900-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; 
GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 @@ -1085,42 +1108,37 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-SDAG-NEXT: s_clause 0x1 ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, s9 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s6 +; GFX1100-SDAG-NEXT: s_and_b32 s6, s6, exec_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s7 +; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v7, 5, v7 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s3, v2 -; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_ldexp_f32 v7, s0, v7 +; GFX1100-SDAG-NEXT: s_cselect_b32 s7, 32, 0 
+; GFX1100-SDAG-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s3, s6 +; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0 +; GFX1100-SDAG-NEXT: s_and_b32 s6, s9, exec_lo +; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, s7 +; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v6, s1, s3 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v7, s0, s2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v6, 5, v6 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_3) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v2, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_ldexp_f32 v6, s1, v6 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v8, v1 -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v7, v5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v1, v6, v4 +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5 ; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[4:5] ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -1272,8 +1290,7 @@ define float @v_log2_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1300,8 +1317,7 @@ 
define float @v_log2_f32(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1328,8 +1344,7 @@ define float @v_log2_f32(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1355,12 +1370,10 @@ define float @v_log2_f32(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1400,8 +1413,7 @@ define float @v_log2_fabs_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 
5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1428,8 +1440,7 @@ define float @v_log2_fabs_f32(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1456,8 +1467,7 @@ define float @v_log2_fabs_f32(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1484,11 +1494,10 @@ define float @v_log2_fabs_f32(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1529,8 
+1538,7 @@ define float @v_log2_fneg_fabs_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -|v0|, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1557,8 +1565,7 @@ define float @v_log2_fneg_fabs_f32(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1585,8 +1592,7 @@ define float @v_log2_fneg_fabs_f32(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1613,11 +1619,10 @@ define float @v_log2_fneg_fabs_f32(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e64 s0, 0x80800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; 
GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1659,8 +1664,7 @@ define float @v_log2_fneg_f32(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1687,8 +1691,7 @@ define float @v_log2_fneg_f32(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, -v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1715,8 +1718,7 @@ define float @v_log2_fneg_f32(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1742,12 +1744,10 @@ define float @v_log2_fneg_f32(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0 -; 
GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1788,8 +1788,7 @@ define float @v_log2_f32_fast(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1816,8 +1815,7 @@ define float @v_log2_f32_fast(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1844,8 +1842,7 @@ define float @v_log2_f32_fast(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 
@@ -1871,12 +1868,10 @@ define float @v_log2_f32_fast(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1916,8 +1911,7 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1944,8 +1938,7 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1972,8 +1965,7 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; 
GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -1999,12 +1991,10 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2044,8 +2034,7 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2072,8 +2061,7 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: 
v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2100,8 +2088,7 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2127,12 +2114,10 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2172,8 +2157,7 @@ define float @v_log2_f32_ninf(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2200,8 +2184,7 @@ 
define float @v_log2_f32_ninf(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2228,8 +2211,7 @@ define float @v_log2_f32_ninf(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2255,12 +2237,10 @@ define float @v_log2_f32_ninf(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2300,8 +2280,7 @@ define float @v_log2_f32_afn(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: 
v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2328,8 +2307,7 @@ define float @v_log2_f32_afn(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2356,8 +2334,7 @@ define float @v_log2_f32_afn(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2383,12 +2360,10 @@ define float @v_log2_f32_afn(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2454,8 +2429,7 @@ define float 
@v_log2_f32_afn_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2482,8 +2456,7 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2510,8 +2483,7 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2537,12 +2509,10 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; 
GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2582,8 +2552,7 @@ define float @v_fabs_log2_f32_afn(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2610,8 +2579,7 @@ define float @v_fabs_log2_f32_afn(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2638,8 +2606,7 @@ define float @v_fabs_log2_f32_afn(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2666,11 +2633,10 @@ define float @v_fabs_log2_f32_afn(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; 
GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2737,8 +2703,7 @@ define float @v_log2_f32_nnan(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2765,8 +2730,7 @@ define float @v_log2_f32_nnan(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2793,8 +2757,7 @@ define float @v_log2_f32_nnan(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ 
-2820,12 +2783,10 @@ define float @v_log2_f32_nnan(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2891,8 +2852,7 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2919,8 +2879,7 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2947,8 +2906,7 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 
0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -2974,12 +2932,10 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -3045,8 +3001,7 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3073,8 +3028,7 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3101,8 
+3055,7 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3128,12 +3081,10 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -3173,8 +3124,7 @@ define float @v_log2_f32_nnan_ninf(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3201,8 +3151,7 @@ define float @v_log2_f32_nnan_ninf(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3229,8 +3178,7 @@ define float @v_log2_f32_nnan_ninf(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3256,12 +3204,10 @@ define float @v_log2_f32_nnan_ninf(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -3327,8 +3273,7 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: 
v_mov_b32_e32 v1, 0x42000000 @@ -3355,8 +3300,7 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3383,8 +3327,7 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3410,12 +3353,10 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -3481,8 +3422,7 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; 
SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3509,8 +3449,7 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3537,8 +3476,7 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3564,12 +3502,10 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; 
GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -3738,8 +3674,7 @@ define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3806,8 +3741,7 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, 0x800000 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; SI-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-NEXT: v_log_f32_e32 v0, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3821,8 +3755,7 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_mov_b32 s4, 0x800000 ; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc ; VI-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-NEXT: v_log_f32_e32 v0, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3836,8 +3769,7 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-NEXT: v_log_f32_e32 v0, v0 ; GFX900-NEXT: v_mov_b32_e32 v1, 0x42000000 @@ -3851,11 +3783,10 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { ; 
GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo ; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v2, 5, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 4532571d5cf2a..e828a12442fb8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -42,11 +42,7 @@ define half @v_maximum_f16(half %src0, half %src1) { ; GFX950-LABEL: v_maximum_f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16: @@ -96,11 +92,17 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) { ; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f16__nnan: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f16__nnan: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f16__nnan: 
+; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan: ; GFX10: ; %bb.0: @@ -162,11 +164,7 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) { ; GFX950-LABEL: v_maximum_f16__nsz: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nsz: @@ -216,11 +214,17 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) { ; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f16__nnan_nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f16__nnan_nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f16__nnan_nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -286,11 +290,7 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 ; 
GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan_src0: @@ -367,11 +367,7 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan_src1: @@ -458,12 +454,9 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX950-LABEL: s_maximum_f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, s1 -; GFX950-NEXT: v_max_f16_e32 v1, s0, v0 -; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use v0 @@ -2505,3 +2498,4 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GCN: {{.*}} +; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index 10f744d7d50ad..8b1ba393c8de8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -1727,11 +1727,6 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-LABEL: v_maximum_v16f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v31, s30, 0 -; GFX7-NEXT: v_writelane_b32 v31, s31, 1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 @@ -1743,7 +1738,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_max_f32_e32 v19, v0, v16 ; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 ; GFX7-NEXT: v_max_f32_e32 v16, v14, v30 -; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX7-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 ; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1765,8 +1760,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] -; GFX7-NEXT: v_readlane_b32 s30, v31, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[40:41] ; GFX7-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] ; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] @@ -1780,25 +1774,15 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x 
float> %src1) { ; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] ; GFX7-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] ; GFX7-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] -; GFX7-NEXT: v_readlane_b32 s31, v31, 1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_max_f32_e32 v16, v15, v17 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 ; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v31, s30, 0 -; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 @@ -1810,7 +1794,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_max_f32_e32 v19, v0, v16 ; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 ; GFX8-NEXT: v_max_f32_e32 v16, v14, v30 -; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX8-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 ; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1832,8 +1816,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 ; GFX8-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] -; GFX8-NEXT: v_readlane_b32 s30, v31, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[40:41] ; GFX8-NEXT: v_cndmask_b32_e64 v0, 
v18, v19, s[28:29] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] @@ -1847,25 +1830,15 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] ; GFX8-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] ; GFX8-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] -; GFX8-NEXT: v_readlane_b32 s31, v31, 1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f32_e32 v16, v15, v17 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v16f32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v31, s30, 0 -; GFX900-NEXT: v_writelane_b32 v31, s31, 1 ; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX900-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 @@ -1877,7 +1850,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_max_f32_e32 v19, v0, v16 ; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 ; GFX900-NEXT: v_max_f32_e32 v16, v14, v30 -; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 ; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX900-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1899,8 +1872,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 ; GFX900-NEXT: v_max_f32_e32 
v13, v13, v29 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] -; GFX900-NEXT: v_readlane_b32 s30, v31, 0 +; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[40:41] ; GFX900-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] ; GFX900-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX900-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] @@ -1914,15 +1886,10 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] ; GFX900-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] ; GFX900-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] -; GFX900-NEXT: v_readlane_b32 s31, v31, 1 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_max_f32_e32 v16, v15, v17 ; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 ; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_maximum_v16f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index ccce0156b5499..a106ba207b949 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll @@ -2008,13 +2008,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-LABEL: v_maximum_v16f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v34, s34, 0 -; GFX7-NEXT: v_writelane_b32 v34, s35, 1 -; GFX7-NEXT: v_writelane_b32 v34, s30, 2 -; GFX7-NEXT: v_writelane_b32 v34, s31, 3 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], 
s32 offset:8 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2102,18 +2095,16 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX7-NEXT: v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32] ; GFX7-NEXT: v_max_f64 v[28:29], v[28:29], v[31:32] ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[40:41] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX7-NEXT: v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33] ; GFX7-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] ; GFX7-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX7-NEXT: v_readlane_b32 s30, v34, 2 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] @@ -2128,27 +2119,14 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] ; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] ; GFX7-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX7-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX7-NEXT: v_readlane_b32 s31, v34, 3 -; GFX7-NEXT: v_readlane_b32 s35, v34, 1 -; GFX7-NEXT: v_readlane_b32 s34, v34, 0 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: 
s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[40:41] +; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[42:43] +; GFX7-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[42:43] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v34, s34, 0 -; GFX8-NEXT: v_writelane_b32 v34, s35, 1 -; GFX8-NEXT: v_writelane_b32 v34, s30, 2 -; GFX8-NEXT: v_writelane_b32 v34, s31, 3 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2236,18 +2214,16 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX8-NEXT: v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32] ; GFX8-NEXT: v_max_f64 v[28:29], v[28:29], v[31:32] ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX8-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[40:41] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX8-NEXT: v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33] ; GFX8-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] ; GFX8-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX8-NEXT: v_readlane_b32 s30, v34, 2 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] ; GFX8-NEXT: 
v_cndmask_b32_e64 v5, v5, v32, s[6:7] @@ -2262,27 +2238,14 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX8-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] ; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] ; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX8-NEXT: v_readlane_b32 s31, v34, 3 -; GFX8-NEXT: v_readlane_b32 s35, v34, 1 -; GFX8-NEXT: v_readlane_b32 s34, v34, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[40:41] +; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[42:43] +; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[42:43] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v16f64: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v34, s34, 0 -; GFX900-NEXT: v_writelane_b32 v34, s35, 1 -; GFX900-NEXT: v_writelane_b32 v34, s30, 2 -; GFX900-NEXT: v_writelane_b32 v34, s31, 3 ; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -2370,18 +2333,16 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX900-NEXT: v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32] ; GFX900-NEXT: v_max_f64 v[28:29], v[28:29], 
v[31:32] ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[40:41] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX900-NEXT: v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33] ; GFX900-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] ; GFX900-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX900-NEXT: v_readlane_b32 s30, v34, 2 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc ; GFX900-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] ; GFX900-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] @@ -2396,15 +2357,9 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX900-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] ; GFX900-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] ; GFX900-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX900-NEXT: v_readlane_b32 s31, v34, 3 -; GFX900-NEXT: v_readlane_b32 s35, v34, 1 -; GFX900-NEXT: v_readlane_b32 s34, v34, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[40:41] +; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[42:43] +; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[42:43] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_maximum_v16f64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 0b9cb9682ea5f..9a2ef15737308 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -30,11 +30,7 @@ define half @v_minimum_f16(half %src0, half %src1) { ; GFX950-LABEL: v_minimum_f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16: @@ -74,11 +70,17 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) { ; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f16__nnan: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f16__nnan: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f16__nnan: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan: ; GFX10: ; %bb.0: @@ -127,11 +129,7 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) { ; GFX950-LABEL: v_minimum_f16__nsz: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nsz: @@ -171,11 +169,17 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) { ; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX9-LABEL: v_minimum_f16__nnan_nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f16__nnan_nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f16__nnan_nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -227,11 +231,7 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan_src0: @@ -294,11 +294,7 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan_src1: @@ -368,12 +364,9 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX950-LABEL: s_minimum_f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, s1 -; GFX950-NEXT: v_min_f16_e32 v1, s0, v0 -; GFX950-NEXT: 
v_mov_b32_e32 v2, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use v0 @@ -1924,3 +1917,4 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} +; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index ef7786cbe8a00..7b2998cbd242f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -1727,11 +1727,6 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-LABEL: v_minimum_v16f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v31, s30, 0 -; GFX7-NEXT: v_writelane_b32 v31, s31, 1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 @@ -1743,7 +1738,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_min_f32_e32 v19, v0, v16 ; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 ; GFX7-NEXT: v_min_f32_e32 v16, v14, v30 -; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX7-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 ; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1765,8 +1760,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: 
v_cmp_o_f32_e64 s[26:27], v13, v29 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] -; GFX7-NEXT: v_readlane_b32 s30, v31, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[40:41] ; GFX7-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] ; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] @@ -1780,25 +1774,15 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] ; GFX7-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] ; GFX7-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] -; GFX7-NEXT: v_readlane_b32 s31, v31, 1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_min_f32_e32 v16, v15, v17 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 ; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v16f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v31, s30, 0 -; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 @@ -1810,7 +1794,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_min_f32_e32 v19, v0, v16 ; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 ; GFX8-NEXT: v_min_f32_e32 v16, v14, v30 -; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX8-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 ; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], 
v4, v20 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1832,8 +1816,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 ; GFX8-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] -; GFX8-NEXT: v_readlane_b32 s30, v31, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[40:41] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] @@ -1847,25 +1830,15 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] ; GFX8-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] ; GFX8-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] -; GFX8-NEXT: v_readlane_b32 s31, v31, 1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_min_f32_e32 v16, v15, v17 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v16f32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v31, s30, 0 -; GFX900-NEXT: v_writelane_b32 v31, s31, 1 ; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX900-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 @@ -1877,7 +1850,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_min_f32_e32 v19, 
v0, v16 ; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 ; GFX900-NEXT: v_min_f32_e32 v16, v14, v30 -; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[40:41], v14, v30 ; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX900-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1899,8 +1872,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 ; GFX900-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] -; GFX900-NEXT: v_readlane_b32 s30, v31, 0 +; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[40:41] ; GFX900-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] ; GFX900-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] ; GFX900-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] @@ -1914,15 +1886,10 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] ; GFX900-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] ; GFX900-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] -; GFX900-NEXT: v_readlane_b32 s31, v31, 1 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_min_f32_e32 v16, v15, v17 ; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 ; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_minimum_v16f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index fc094e326aa41..9dac7930ebf0f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -2008,13 +2008,6 @@ define <16 x double> @v_minimum_v16f64(<16 x 
double> %src0, <16 x double> %src1) ; GFX7-LABEL: v_minimum_v16f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v34, s34, 0 -; GFX7-NEXT: v_writelane_b32 v34, s35, 1 -; GFX7-NEXT: v_writelane_b32 v34, s30, 2 -; GFX7-NEXT: v_writelane_b32 v34, s31, 3 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2102,18 +2095,16 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX7-NEXT: v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32] ; GFX7-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[40:41] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX7-NEXT: v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33] ; GFX7-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] ; GFX7-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX7-NEXT: v_readlane_b32 s30, v34, 2 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] @@ -2128,27 +2119,14 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] ; 
GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] ; GFX7-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX7-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX7-NEXT: v_readlane_b32 s31, v34, 3 -; GFX7-NEXT: v_readlane_b32 s35, v34, 1 -; GFX7-NEXT: v_readlane_b32 s34, v34, 0 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[40:41] +; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[42:43] +; GFX7-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[42:43] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v16f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v34, s34, 0 -; GFX8-NEXT: v_writelane_b32 v34, s35, 1 -; GFX8-NEXT: v_writelane_b32 v34, s30, 2 -; GFX8-NEXT: v_writelane_b32 v34, s31, 3 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2236,18 +2214,16 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX8-NEXT: v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32] ; GFX8-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX8-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] 
+; GFX8-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[40:41] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX8-NEXT: v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33] ; GFX8-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] ; GFX8-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX8-NEXT: v_readlane_b32 s30, v34, 2 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] @@ -2262,27 +2238,14 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX8-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] ; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] ; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX8-NEXT: v_readlane_b32 s31, v34, 3 -; GFX8-NEXT: v_readlane_b32 s35, v34, 1 -; GFX8-NEXT: v_readlane_b32 s34, v34, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[40:41] +; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[42:43] +; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[42:43] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v16f64: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v34, s34, 0 -; GFX900-NEXT: v_writelane_b32 v34, s35, 1 -; GFX900-NEXT: v_writelane_b32 v34, s30, 2 -; GFX900-NEXT: v_writelane_b32 v34, s31, 3 ; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX900-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -2370,18 +2333,16 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX900-NEXT: v_cmp_u_f64_e64 s[40:41], v[28:29], v[31:32] ; GFX900-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[40:41] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX900-NEXT: v_cmp_u_f64_e64 s[42:43], v[30:31], v[32:33] ; GFX900-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] ; GFX900-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX900-NEXT: v_readlane_b32 s30, v34, 2 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc ; GFX900-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] ; GFX900-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] @@ -2396,15 +2357,9 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX900-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] ; GFX900-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] ; GFX900-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX900-NEXT: v_readlane_b32 s31, v34, 3 -; GFX900-NEXT: v_readlane_b32 s35, v34, 1 -; GFX900-NEXT: v_readlane_b32 s34, v34, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX900-NEXT: 
s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[40:41] +; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[42:43] +; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[42:43] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_minimum_v16f64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index d46622ef45f43..94c2e518a9fd3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -115,17 +115,22 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v4, v1 ; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -296,32 +301,40 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v12, v1 ; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo ; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo ; GFX12-NEXT: v_add_co_u32 v7, vcc_lo, v7, v10 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo ; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo ; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo ; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_subrev_co_ci_u32_e32 v7, 
vcc_lo, 0, v5, vcc_lo ; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7 ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3] +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -771,6 +784,7 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) { ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -843,6 +857,7 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll index 33e34e38a1837..0ac68c13d2703 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll @@ -1324,53 +1324,21 @@ define void @v_set_rounding_select_1_3(i32 %cond) { } define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) { -; GFX6-LABEL: s_set_rounding_select_2_0: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 -; GFX6-NEXT: s_cselect_b64 s[34:35], -1, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX6-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0 -; GFX6-NEXT: v_readfirstlane_b32 s34, v0 -; GFX6-NEXT: s_setreg_b32 
hwreg(HW_REG_MODE, 0, 4), s34 -; GFX6-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: s_set_rounding_select_2_0: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_cmp_eq_u32 s4, 0 -; GFX7-NEXT: s_cselect_b64 s[34:35], -1, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX7-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0 -; GFX7-NEXT: v_readfirstlane_b32 s34, v0 -; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: s_set_rounding_select_2_0: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 -; GFX8-NEXT: s_cselect_b64 s[34:35], -1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_mov_b32 s34, 0xa50f -; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s34 -; GFX8-NEXT: v_readfirstlane_b32 s34, v0 -; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX678-LABEL: s_set_rounding_select_2_0: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_cmp_eq_u32 s4, 0 +; GFX678-NEXT: s_movk_i32 s34, 0xa5 +; GFX678-NEXT: s_cselect_b32 s34, s34, 0xa50f +; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 +; GFX678-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: s_set_rounding_select_2_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_mov_b32 s34, 0xa50f -; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s34 -; GFX9-NEXT: v_readfirstlane_b32 s34, v0 +; GFX9-NEXT: s_movk_i32 s34, 0xa5 +; GFX9-NEXT: s_cselect_b32 s34, s34, 0xa50f ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1378,11 +1346,8 @@ define amdgpu_gfx void 
@s_set_rounding_select_2_0(i32 inreg %cond) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s34, -1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f -; GFX10-NEXT: v_readfirstlane_b32 s34, v0 +; GFX10-NEXT: s_movk_i32 s34, 0xa5 +; GFX10-NEXT: s_cselect_b32 s34, s34, 0xa50f ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1390,11 +1355,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_movk_i32 s0, 0xa5 +; GFX11-NEXT: s_cselect_b32 s0, s0, 0xa50f ; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %cond, 0 @@ -1530,10 +1492,7 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) { ; GFX678: ; %bb.0: ; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX678-NEXT: s_cmp_eq_u32 s4, 0 -; GFX678-NEXT: s_cselect_b64 s[34:35], -1, 0 -; GFX678-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] -; GFX678-NEXT: v_readfirstlane_b32 s34, v0 -; GFX678-NEXT: s_lshl_b32 s34, s34, 2 +; GFX678-NEXT: s_cselect_b32 s34, 4, 0 ; GFX678-NEXT: s_add_i32 s35, s34, -4 ; GFX678-NEXT: s_min_u32 s34, s34, s35 ; GFX678-NEXT: s_lshl_b32 s36, s34, 2 @@ -1547,10 +1506,7 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] -; 
GFX9-NEXT: v_readfirstlane_b32 s34, v0 -; GFX9-NEXT: s_lshl_b32 s34, s34, 2 +; GFX9-NEXT: s_cselect_b32 s34, 4, 0 ; GFX9-NEXT: s_add_i32 s35, s34, -4 ; GFX9-NEXT: s_min_u32 s34, s34, s35 ; GFX9-NEXT: s_lshl_b32 s36, s34, 2 @@ -1564,10 +1520,7 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s34, -1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34 -; GFX10-NEXT: v_readfirstlane_b32 s34, v0 -; GFX10-NEXT: s_lshl_b32 s34, s34, 2 +; GFX10-NEXT: s_cselect_b32 s34, 4, 0 ; GFX10-NEXT: s_add_i32 s35, s34, -4 ; GFX10-NEXT: s_min_u32 s36, s34, s35 ; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f @@ -1581,10 +1534,7 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_cselect_b32 s0, 4, 0 ; GFX11-NEXT: s_add_i32 s1, s0, -4 ; GFX11-NEXT: s_min_u32 s2, s0, s1 ; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 223870950e4b7..fbda0e71a74c6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -6026,7 +6026,6 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 @@ -8138,12 +8137,11 @@ define amdgpu_kernel void 
@constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10014 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10015 ; GFX12-NEXT: s_lshr_b32 s4, s3, 31 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1001e ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416 @@ -8231,8 +8229,8 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10001 ; GFX12-NEXT: s_and_b32 s3, s3, 1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:272 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s3, s2, 31 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001e @@ -9475,14 +9473,12 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s94, s13, 29 ; GFX12-NEXT: s_lshr_b32 s78, s13, 26 ; GFX12-NEXT: s_lshr_b32 s88, s13, 27 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000 ; GFX12-NEXT: s_lshr_b32 s66, s13, 24 ; GFX12-NEXT: s_lshr_b32 s74, s13, 25 ; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s96 ; GFX12-NEXT: s_lshr_b32 s56, s13, 22 ; GFX12-NEXT: s_lshr_b32 s62, s13, 23 @@ -9499,21 +9495,18 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s2, s13, 14 ; GFX12-NEXT: s_lshr_b32 
s4, s13, 15 ; GFX12-NEXT: v_dual_mov_b32 v6, s93 :: v_dual_mov_b32 v7, s94 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, s95 :: v_dual_mov_b32 v9, s78 ; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 ; GFX12-NEXT: s_lshr_b32 s6, s13, 12 ; GFX12-NEXT: s_lshr_b32 s8, s13, 13 ; GFX12-NEXT: v_dual_mov_b32 v10, s79 :: v_dual_mov_b32 v11, s88 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v12, s89 :: v_dual_mov_b32 v13, s66 ; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 ; GFX12-NEXT: s_lshr_b32 s10, s13, 10 ; GFX12-NEXT: s_lshr_b32 s14, s13, 11 ; GFX12-NEXT: v_dual_mov_b32 v14, s67 :: v_dual_mov_b32 v15, s74 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v16, s75 :: v_dual_mov_b32 v17, s56 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 @@ -9526,7 +9519,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s16, s13, 8 ; GFX12-NEXT: s_lshr_b32 s20, s13, 9 ; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v19, s62 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v20, s63 :: v_dual_mov_b32 v21, s44 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 @@ -9658,6 +9650,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s44, s12, 4 ; GFX12-NEXT: s_lshr_b32 s30, s12, 2 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[12:13], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 255a1acbe0086..2afac4e90aa40 100644 --- 
a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -8875,7 +8875,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v4, s64 :: v_dual_mov_b32 v7, s61 ; GFX12-NEXT: v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s13 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s59 ; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s57 ; GFX12-NEXT: v_dual_mov_b32 v12, s56 :: v_dual_mov_b32 v15, s55 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index efc31fbd5ed9e..b945c7c3def6a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -8756,7 +8756,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s37 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s75 ; GFX12-NEXT: v_dual_mov_b32 v2, s74 :: v_dual_mov_b32 v5, s41 ; GFX12-NEXT: s_lshr_b32 s48, s5, 16 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 4217384cdd5ce..809dbec5e7a73 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -6487,50 +6487,50 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; 
GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v21 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v3 +; 
GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v8 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, 
s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index e4602f20f8a37..5fd6deff0fbbb 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -505,7 +505,6 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f64: @@ -697,7 +696,6 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f64__offset: @@ -888,7 +886,6 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f64: @@ -1071,7 +1068,6 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
local_atomic_fadd_noret_f64__offset: @@ -1274,7 +1270,6 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16: @@ -1581,7 +1576,6 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16__offset: @@ -1895,7 +1889,6 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16: @@ -2190,7 +2183,6 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16__offset: @@ -2485,7 +2477,6 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_f16__offset__align4: @@ -2721,7 +2712,6 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_f16__offset__align4: @@ -2950,8 +2940,9 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2971,7 +2962,6 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16: @@ -3305,8 +3295,9 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -3326,7 +3317,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset: @@ -3667,8 +3657,9 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -3688,7 +3679,6 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16: @@ -4012,6 +4002,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -4031,7 +4022,6 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset: @@ -4356,6 +4346,7 @@ define bfloat 
@local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -4374,7 +4365,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset__align4: @@ -4646,8 +4636,9 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -4665,7 +4656,6 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset__align4: @@ -7030,7 +7020,6 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB28_2 ; GFX12-NEXT: ; %bb.1: -; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s3, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 @@ -7106,6 +7095,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo @@ -7897,7 +7887,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB29_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s3, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 @@ -7969,6 +7958,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index d419b0cdfdd1a..1e8072460c7a3 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -815,7 +815,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16: @@ -1129,7 +1128,6 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1450,7 +1448,6 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16: @@ -1753,7 +1750,6 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16__offset: @@ -2056,7 +2052,6 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_f16__offset__align4: @@ -2300,7 +2295,6 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_f16__offset__align4: @@ -2536,8 +2530,9 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: 
v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2557,7 +2552,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16: @@ -2893,8 +2887,9 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2914,7 +2909,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset: @@ -3257,8 +3251,9 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; 
GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -3278,7 +3273,6 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16: @@ -3604,6 +3598,7 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -3623,7 +3618,6 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset: @@ -3950,6 +3944,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -3968,7 +3963,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset__align4: @@ -4242,8 +4236,9 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -4261,7 +4256,6 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset__align4: @@ -4542,7 +4536,6 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2f16: @@ -4814,7 +4807,6 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2f16__offset: @@ -5085,7 +5077,6 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2f16: @@ -5347,7 +5338,6 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2f16__offset: @@ -5609,6 +5599,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3 @@ -5617,6 +5608,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 @@ -5633,7 +5625,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2bf16: @@ -5986,6 +5977,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3 @@ -5994,6 +5986,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 @@ -6010,7 +6003,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_ret_v2bf16__offset: @@ -6368,8 +6360,10 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, 
v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 @@ -6386,7 +6380,6 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16: @@ -6732,8 +6725,10 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 @@ -6750,7 +6745,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16__ofset: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 282947afa409a..7249b0b1fc0e3 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -815,7 +815,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16: @@ -1129,7 +1128,6 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1450,7 +1448,6 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16: @@ -1753,7 +1750,6 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16__offset: @@ -2056,7 +2052,6 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_f16__offset__align4: @@ -2300,7 +2295,6 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_f16__offset__align4: @@ -2536,8 +2530,9 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; 
GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2557,7 +2552,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16: @@ -2893,8 +2887,9 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2914,7 +2909,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -3257,8 +3251,9 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 
0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -3278,7 +3273,6 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16: @@ -3604,6 +3598,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -3623,7 +3618,6 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset: @@ -3950,6 +3944,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: 
v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -3968,7 +3963,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset__align4: @@ -4242,8 +4236,9 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -4261,7 +4256,6 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset__align4: @@ -4542,7 +4536,6 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2f16: @@ -4814,7 +4807,6 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: 
v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2f16__offset: @@ -5085,7 +5077,6 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2f16: @@ -5347,7 +5338,6 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2f16__offset: @@ -5609,6 +5599,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3 @@ -5617,6 +5608,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 @@ -5633,7 +5625,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2bf16: @@ -5986,6 +5977,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3 @@ -5994,6 +5986,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 @@ -6010,7 +6003,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_ret_v2bf16__offset: @@ -6368,8 +6360,10 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: 
v_perm_b32 v4, v5, v4, 0x7060302 @@ -6386,7 +6380,6 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16: @@ -6732,8 +6725,10 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 @@ -6750,7 +6745,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16__ofset: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 967e972e53e29..65e00c50292dc 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -42,7 +42,6 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f32: @@ -254,7 +253,6 
@@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f32__offset: @@ -465,7 +463,6 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32: @@ -666,7 +663,6 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32__offset: @@ -875,7 +871,6 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f64: @@ -1092,7 +1087,6 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB5_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f64__offset: @@ -1308,7 +1302,6 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
local_atomic_fsub_noret_f64: @@ -1514,7 +1507,6 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f64__offset: @@ -1740,7 +1732,6 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16: @@ -2047,7 +2038,6 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2361,7 +2351,6 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16: @@ -2656,7 +2645,6 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16__offset: @@ -2951,7 +2939,6 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f16__offset__align4: @@ -3187,7 +3174,6 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f16__offset__align4: @@ -3416,8 +3402,9 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -3437,7 +3424,6 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16: @@ -3771,8 +3757,9 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -3792,7 +3779,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset: @@ -4133,8 +4119,9 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -4154,7 +4141,6 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16: @@ -4478,6 +4464,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -4497,7 +4484,6 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) 
%ptr) nounwin ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset: @@ -4822,6 +4808,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -4840,7 +4827,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset__align4: @@ -5112,8 +5098,9 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -5131,7 +5118,6 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset__align4: @@ -5408,7 +5394,6 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2f16: @@ -5665,7 +5650,6 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2f16__offset: @@ -5920,7 +5904,6 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2f16: @@ -6165,7 +6148,6 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2f16__offset: @@ -6413,6 +6395,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3 @@ -6421,6 +6404,7 @@ define <2 x bfloat> 
@local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 @@ -6437,7 +6421,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2bf16: @@ -6790,6 +6773,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3 @@ -6798,6 +6782,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 @@ -6814,7 +6799,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_v2bf16__offset: 
@@ -7172,8 +7156,10 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 @@ -7190,7 +7176,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16: @@ -7536,8 +7521,10 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 @@ -7554,7 +7541,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
local_atomic_fsub_noret_v2bf16__ofset: @@ -7907,7 +7893,6 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode: @@ -8117,7 +8102,6 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode: diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll index 30c8739032c90..b5e4088398977 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -5,28 +5,28 @@ ; Uses llvm.amdgcn.break define amdgpu_kernel void @break_loop(i32 %arg) #0 { -; OPT-LABEL: @break_loop( -; OPT-NEXT: bb: +; OPT-LABEL: define amdgpu_kernel void @break_loop( +; OPT-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[BB:.*]]: ; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]] -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: bb1: -; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP2:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] -; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[TMP0:%.*]], [[FLOW]] ] -; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 -; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 -; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] -; OPT: bb4: +; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP2:%.*]], %[[FLOW:.*]] ], [ 0, %[[BB]] ] +; 
OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, %[[BB]] ], [ [[TMP0:%.*]], %[[FLOW]] ] +; OPT-NEXT: [[TMP0]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[TMP0]], 0 +; OPT-NEXT: br i1 [[CMP0]], label %[[BB4:.*]], label %[[FLOW]] +; OPT: [[BB4]]: ; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4 ; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] -; OPT-NEXT: br label [[FLOW]] -; OPT: Flow: -; OPT-NEXT: [[TMP0]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] -; OPT-NEXT: [[TMP1:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ true, [[BB1]] ] +; OPT-NEXT: br label %[[FLOW]] +; OPT: [[FLOW]]: +; OPT-NEXT: [[TMP1:%.*]] = phi i1 [ [[CMP1]], %[[BB4]] ], [ true, %[[BB1]] ] ; OPT-NEXT: [[TMP2]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP1]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP3:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP2]]) -; OPT-NEXT: br i1 [[TMP3]], label [[BB9:%.*]], label [[BB1]] -; OPT: bb9: +; OPT-NEXT: br i1 [[TMP3]], label %[[BB9:.*]], label %[[BB1]] +; OPT: [[BB9]]: ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) ; OPT-NEXT: ret void ; @@ -45,11 +45,8 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_cmp_gt_i32 s6, -1 -; GCN-NEXT: s_cbranch_scc0 .LBB0_3 -; GCN-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; GCN-NEXT: ; implicit-def: $sgpr6 -; GCN-NEXT: s_branch .LBB0_4 -; GCN-NEXT: .LBB0_3: ; %bb4 +; GCN-NEXT: s_cbranch_scc1 .LBB0_3 +; GCN-NEXT: ; %bb.2: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -57,13 +54,13 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 { ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_and_b64 s[8:9], vcc, exec ; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-NEXT: .LBB0_4: ; %Flow +; GCN-NEXT: .LBB0_3: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; 
GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] ; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN-NEXT: s_cbranch_execnz .LBB0_1 -; GCN-NEXT: ; %bb.5: ; %bb9 +; GCN-NEXT: ; %bb.4: ; %bb9 ; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -86,28 +83,29 @@ bb9: } define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { -; OPT-LABEL: @undef_phi_cond_break_loop( -; OPT-NEXT: bb: +; OPT-LABEL: define amdgpu_kernel void @undef_phi_cond_break_loop( +; OPT-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[BB:.*]]: ; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]] -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: bb1: -; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] -; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] +; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], %[[FLOW:.*]] ], [ 0, %[[BB]] ] +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, %[[BB]] ], [ [[MY_TMP2:%.*]], %[[FLOW]] ] ; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 ; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 -; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] -; OPT: bb4: +; OPT-NEXT: br i1 [[CMP0]], label %[[BB4:.*]], label %[[FLOW]] +; OPT: [[BB4]]: ; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4 ; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] -; OPT-NEXT: br label [[FLOW]] -; OPT: Flow: -; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] -; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ undef, [[BB1]] ] +; OPT-NEXT: br label %[[FLOW]] +; OPT: [[FLOW]]: +; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], %[[BB4]] ], [ undef, %[[BB1]] ] +; OPT-NEXT: [[MY_TMP3:%.*]] = 
phi i1 [ [[CMP1]], %[[BB4]] ], [ undef, %[[BB1]] ] ; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) -; OPT-NEXT: br i1 [[TMP1]], label [[BB9:%.*]], label [[BB1]] -; OPT: bb9: +; OPT-NEXT: br i1 [[TMP1]], label %[[BB9:.*]], label %[[BB1]] +; OPT: [[BB9]]: ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]]) ; OPT-NEXT: store volatile i32 7, ptr addrspace(3) undef, align 4 ; OPT-NEXT: ret void @@ -178,29 +176,30 @@ bb9: ; preds = %Flow @lds = addrspace(3) global i32 undef define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { -; OPT-LABEL: @constexpr_phi_cond_break_loop( -; OPT-NEXT: bb: +; OPT-LABEL: define amdgpu_kernel void @constexpr_phi_cond_break_loop( +; OPT-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[BB:.*]]: ; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]] -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: bb1: -; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] -; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] +; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], %[[FLOW:.*]] ], [ 0, %[[BB]] ] +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, %[[BB]] ], [ [[MY_TMP2:%.*]], %[[FLOW]] ] ; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 ; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 ; OPT-NEXT: [[CMP2:%.*]] = icmp ne ptr addrspace(3) inttoptr (i32 4 to ptr addrspace(3)), @lds -; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] -; OPT: bb4: +; OPT-NEXT: br i1 [[CMP0]], label %[[BB4:.*]], label %[[FLOW]] +; OPT: [[BB4]]: ; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4 ; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], 
[[LOAD]] -; OPT-NEXT: br label [[FLOW]] -; OPT: Flow: -; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] -; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ [[CMP2]], [[BB1]] ] +; OPT-NEXT: br label %[[FLOW]] +; OPT: [[FLOW]]: +; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], %[[BB4]] ], [ undef, %[[BB1]] ] +; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], %[[BB4]] ], [ [[CMP2]], %[[BB1]] ] ; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) -; OPT-NEXT: br i1 [[TMP1]], label [[BB9:%.*]], label [[BB1]] -; OPT: bb9: +; OPT-NEXT: br i1 [[TMP1]], label %[[BB9:.*]], label %[[BB1]] +; OPT: [[BB9]]: ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]]) ; OPT-NEXT: store volatile i32 7, ptr addrspace(3) undef, align 4 ; OPT-NEXT: ret void @@ -269,28 +268,29 @@ bb9: ; preds = %Flow } define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { -; OPT-LABEL: @true_phi_cond_break_loop( -; OPT-NEXT: bb: +; OPT-LABEL: define amdgpu_kernel void @true_phi_cond_break_loop( +; OPT-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[BB:.*]]: ; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]] -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: bb1: -; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] -; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] +; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], %[[FLOW:.*]] ], [ 0, %[[BB]] ] +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, %[[BB]] ], [ [[MY_TMP2:%.*]], %[[FLOW]] ] ; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 ; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 -; OPT-NEXT: br i1 [[CMP0]], label 
[[BB4:%.*]], label [[FLOW]] -; OPT: bb4: +; OPT-NEXT: br i1 [[CMP0]], label %[[BB4:.*]], label %[[FLOW]] +; OPT: [[BB4]]: ; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4 ; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] -; OPT-NEXT: br label [[FLOW]] -; OPT: Flow: -; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] -; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ true, [[BB1]] ] +; OPT-NEXT: br label %[[FLOW]] +; OPT: [[FLOW]]: +; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], %[[BB4]] ], [ undef, %[[BB1]] ] +; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], %[[BB4]] ], [ true, %[[BB1]] ] ; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) -; OPT-NEXT: br i1 [[TMP1]], label [[BB9:%.*]], label [[BB1]] -; OPT: bb9: +; OPT-NEXT: br i1 [[TMP1]], label %[[BB9:.*]], label %[[BB1]] +; OPT: [[BB9]]: ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]]) ; OPT-NEXT: store volatile i32 7, ptr addrspace(3) undef, align 4 ; OPT-NEXT: ret void @@ -358,28 +358,29 @@ bb9: ; preds = %Flow } define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { -; OPT-LABEL: @false_phi_cond_break_loop( -; OPT-NEXT: bb: +; OPT-LABEL: define amdgpu_kernel void @false_phi_cond_break_loop( +; OPT-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[BB:.*]]: ; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]] -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: bb1: -; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] -; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] +; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], %[[FLOW:.*]] ], [ 0, %[[BB]] 
] +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, %[[BB]] ], [ [[MY_TMP2:%.*]], %[[FLOW]] ] ; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 ; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 -; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] -; OPT: bb4: +; OPT-NEXT: br i1 [[CMP0]], label %[[BB4:.*]], label %[[FLOW]] +; OPT: [[BB4]]: ; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4 ; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] -; OPT-NEXT: br label [[FLOW]] -; OPT: Flow: -; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] -; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ false, [[BB1]] ] +; OPT-NEXT: br label %[[FLOW]] +; OPT: [[FLOW]]: +; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], %[[BB4]] ], [ undef, %[[BB1]] ] +; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], %[[BB4]] ], [ false, %[[BB1]] ] ; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) -; OPT-NEXT: br i1 [[TMP1]], label [[BB9:%.*]], label [[BB1]] -; OPT: bb9: +; OPT-NEXT: br i1 [[TMP1]], label %[[BB9:.*]], label %[[BB1]] +; OPT: [[BB9]]: ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]]) ; OPT-NEXT: store volatile i32 7, ptr addrspace(3) undef, align 4 ; OPT-NEXT: ret void @@ -450,29 +451,30 @@ bb9: ; preds = %Flow ; continue. 
define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { -; OPT-LABEL: @invert_true_phi_cond_break_loop( -; OPT-NEXT: bb: +; OPT-LABEL: define amdgpu_kernel void @invert_true_phi_cond_break_loop( +; OPT-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[BB:.*]]: ; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]] -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: bb1: -; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[FLOW:%.*]] ], [ 0, [[BB:%.*]] ] -; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[MY_TMP2:%.*]], [[FLOW]] ] +; OPT-NEXT: [[MY_TMP:%.*]] = sub i32 [[ID]], [[ARG]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], %[[FLOW:.*]] ], [ 0, %[[BB]] ] +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, %[[BB]] ], [ [[MY_TMP2:%.*]], %[[FLOW]] ] ; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 ; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 -; OPT-NEXT: br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]] -; OPT: bb4: +; OPT-NEXT: br i1 [[CMP0]], label %[[BB4:.*]], label %[[FLOW]] +; OPT: [[BB4]]: ; OPT-NEXT: [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4 ; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]] -; OPT-NEXT: br label [[FLOW]] -; OPT: Flow: -; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ] -; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ true, [[BB1]] ] +; OPT-NEXT: br label %[[FLOW]] +; OPT: [[FLOW]]: +; OPT-NEXT: [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], %[[BB4]] ], [ undef, %[[BB1]] ] +; OPT-NEXT: [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], %[[BB4]] ], [ true, %[[BB1]] ] ; OPT-NEXT: [[MY_TMP3_INV:%.*]] = xor i1 [[MY_TMP3]], true ; OPT-NEXT: [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3_INV]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) -; OPT-NEXT: br i1 
[[TMP1]], label [[BB9:%.*]], label [[BB1]] -; OPT: bb9: +; OPT-NEXT: br i1 [[TMP1]], label %[[BB9:.*]], label %[[BB1]] +; OPT: [[BB9]]: ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]]) ; OPT-NEXT: store volatile i32 7, ptr addrspace(3) undef, align 4 ; OPT-NEXT: ret void diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll index eaf8809d33fc3..e8744c7828d41 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll @@ -35,10 +35,10 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: v_readfirstlane_b32 s5, v5 ; GFX12-NEXT: v_readfirstlane_b32 s6, v6 ; GFX12-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -47,7 +47,6 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU ; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr9 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-NEXT: ; %bb.2: @@ -60,6 +59,7 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] @@ -73,7 +73,6 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr8 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB0_3 ; GFX12-NEXT: ; %bb.4: @@ -117,10 +116,10 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: v_readfirstlane_b32 s5, v6 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -129,7 +128,6 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: buffer_load_b32 v0, v9, s[4:7], null offen th:TH_LOAD_LU ; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 ; GFX12-NEXT: ; implicit-def: $vgpr9 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB1_1 ; GFX12-NEXT: ; %bb.2: @@ -142,6 +140,7 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: v_readfirstlane_b32 s5, v2 ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] @@ -155,7 +154,6 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, 
ptr addr ; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: ; implicit-def: $vgpr5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB1_3 ; GFX12-NEXT: ; %bb.4: @@ -200,10 +198,10 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: v_readfirstlane_b32 s5, v5 ; GFX12-NEXT: v_readfirstlane_b32 s6, v6 ; GFX12-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -212,7 +210,6 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr9 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: @@ -225,6 +222,7 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] @@ -238,7 +236,6 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr8 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: 
s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB2_3 ; GFX12-NEXT: ; %bb.4: @@ -281,10 +278,10 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: v_readfirstlane_b32 s5, v5 ; GFX12-NEXT: v_readfirstlane_b32 s6, v6 ; GFX12-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -293,7 +290,6 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU ; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX12-NEXT: ; implicit-def: $vgpr9 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: @@ -306,6 +302,7 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] @@ -319,7 +316,6 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr8 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB3_3 ; GFX12-NEXT: ; %bb.4: diff --git 
a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index 6bd0498a2a4e4..b6ff99214249a 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -359,10 +359,10 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe @@ -371,7 +371,6 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-SDAG-NEXT: ; %bb.2: @@ -384,6 +383,7 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] @@ -397,7 +397,6 @@ define amdgpu_kernel void 
@buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8 ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_3 ; GFX12-SDAG-NEXT: ; %bb.4: @@ -426,7 +425,6 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_mov_b32 s4, s9 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 @@ -792,10 +790,10 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe @@ -804,7 +802,6 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT scope:SCOPE_SYS ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_1 ; GFX12-SDAG-NEXT: ; %bb.2: @@ -817,6 +814,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; 
GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] @@ -831,7 +829,6 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8 ; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_3 ; GFX12-SDAG-NEXT: ; %bb.4: @@ -860,7 +857,6 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_mov_b32 s4, s9 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir index d156a0aef6c17..2353101b43144 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir @@ -242,7 +242,7 @@ body: | ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY5]] ; CHECK-NEXT: [[S_FF1_I32_B32_:%[0-9]+]]:sreg_32 = S_FF1_I32_B32 [[COPY8]] - ; CHECK-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY]], [[S_FF1_I32_B32_]] + ; CHECK-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[COPY]], [[S_FF1_I32_B32_]] ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY7]], [[V_READLANE_B32_]], implicit-def dead $scc ; CHECK-NEXT: 
[[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 1, [[S_FF1_I32_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[COPY8]], [[S_LSHL_B32_]], implicit-def dead $scc @@ -311,7 +311,7 @@ body: | %2:sreg_32 = COPY killed %25 %3:sreg_32 = COPY killed %26 %14:sreg_32 = S_FF1_I32_B32 %3 - %15:sreg_32 = V_READLANE_B32 %8, %14 + %15:sreg_32_xm0 = V_READLANE_B32 %8, %14 %4:sreg_32 = S_ADD_I32 killed %2, killed %15, implicit-def dead $scc %17:sreg_32 = S_LSHL_B32 1, killed %14, implicit-def dead $scc %5:sreg_32 = S_ANDN2_B32 killed %3, killed %17, implicit-def dead $scc diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll index eefa0b23d0c08..92d0a05f35732 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll @@ -84,7 +84,7 @@ define amdgpu_kernel void @calls_f0() { define void @f0() { ; CHECK-LABEL: define void @f0() ; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.module.lds.t, ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 8, !noalias !24 -; CHECK-NEXT: store i8 8, ptr addrspace(3) @llvm.amdgcn.module.lds, align 8, !noalias !24 +; CHECK-NEXT: store i8 8, ptr addrspace(3) @llvm.amdgcn.module.lds, align 8, !noalias !29 ; CHECK-NEXT: ret void store i8 1, ptr addrspace(3) @lds.size.1.align.1, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll index d2d15f5ca4577..c46105893cc62 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll @@ -12,9 +12,9 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(ptr addrs ; CHECK-NEXT: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, align 16, !tbaa [[TBAA1:![0-9]+]], !noalias !6 ; 
CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 [[I]] ; CHECK-NEXT: [[VAL_A:%.*]] = load i32, ptr addrspace(3) [[GEP_A]], align 4, !tbaa [[TBAA1]], !noalias !6 -; CHECK-NEXT: store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_PREEXISTING_AA_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), align 16, !tbaa [[TBAA1]], !noalias !6 +; CHECK-NEXT: store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_PREEXISTING_AA_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), align 16, !tbaa [[TBAA1]], !noalias !11 ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_PREEXISTING_AA_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), i32 0, i32 [[I]] -; CHECK-NEXT: [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !tbaa [[TBAA1]], !noalias !6 +; CHECK-NEXT: [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !tbaa [[TBAA1]], !noalias !11 ; CHECK-NEXT: [[VAL:%.*]] = add i32 [[VAL_A]], [[VAL_B]] ; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[ARG]], align 4 ; CHECK-NEXT: ret void @@ -48,4 +48,11 @@ bb: ; CHECK:!3 = !{!"int", !4, i64 0} ; CHECK:!4 = !{!"omnipotent char", !5, i64 0} ; CHECK:!5 = !{!"Simple C++ TBAA"} -; CHECK:!6 = !{} +; CHECK:!6 = !{!7, !9} +; CHECK:!7 = distinct !{!7, !8} +; CHECK:!8 = distinct !{!8} +; CHECK:!9 = distinct !{!9, !10} +; CHECK:!10 = distinct !{!10} +; CHECK:!11 = !{!12, !13} +; CHECK:!12 = distinct !{!12, !8} +; CHECK:!13 = distinct !{!13, !10} diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-with-alias-scope.ll 
b/llvm/test/CodeGen/AMDGPU/lower-lds-with-alias-scope.ll new file mode 100644 index 0000000000000..d8d7fc1d7a3bd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-with-alias-scope.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s + +@a = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4 +@b = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4 +@c = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4 + +define amdgpu_kernel void @ds_load_stores_aainfo(ptr addrspace(1) %arg, i32 %i) { +; GCN-LABEL: ds_load_stores_aainfo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: ds_read2_b32 v[2:3], v4 offset1:1 +; GCN-NEXT: ds_write_b64 v1, v[0:1] offset:512 +; GCN-NEXT: ds_read2_b32 v[4:5], v4 offset0:64 offset1:65 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] +; GCN-NEXT: s_endpgm +bb: + %gep.a = getelementptr inbounds [64 x i32], ptr addrspace(3) @a, i32 0, i32 %i + %gep.b = getelementptr inbounds [64 x i32], ptr addrspace(3) @b, i32 0, i32 %i + + %val.a = load i64, ptr addrspace(3) %gep.a, align 4, !tbaa !0, !alias.scope !6, !noalias !5 + %val.b = load i64, ptr addrspace(3) %gep.b, align 4, !tbaa !0, !alias.scope !6, !noalias !5 + + store i64 1, ptr addrspace(3) @c, align 4, 
!tbaa !0, !noalias !2 + + %val = add i64 %val.a, %val.b + store i64 %val, ptr addrspace(1) %arg, align 4 + + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) + ret void +} + + !0 = !{!"omnipotent char", !1, i64 0} + !1 = !{!1} + !2 = !{!3} + !3 = distinct !{!3, !4} + !4 = distinct !{!4} + !5 = !{!3} + !6 = !{!7} + !7 = !{!7, !4} diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-with-noalias.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-with-noalias.ll new file mode 100644 index 0000000000000..0d0daeaae547d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-with-noalias.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 --amdgpu-lower-module-lds-strategy=module < %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s + +@a = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4 +@b = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4 +@c = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4 + +define amdgpu_kernel void @ds_load_stores_aainfo(ptr addrspace(1) %arg, i32 %i) { +; GCN-LABEL: ds_load_stores_aainfo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: ds_read2_b32 v[2:3], v4 offset1:1 +; GCN-NEXT: ds_write_b64 v1, v[0:1] offset:512 +; GCN-NEXT: ds_read2_b32 v[4:5], v4 offset0:64 offset1:65 +; GCN-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x24 +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] +; GCN-NEXT: s_endpgm +; CHECK-LABEL: define amdgpu_kernel void @ds_load_stores_aainfo( +; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.ds_load_stores_aainfo.lds, i32 0, i32 [[I]] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_DS_LOAD_STORES_AAINFO_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.ds_load_stores_aainfo.lds, i32 0, i32 1), i32 0, i32 [[I]] +; CHECK-NEXT: [[VAL_A:%.*]] = load i64, ptr addrspace(3) [[GEP_A]], align 4, !tbaa [[TBAA1:![0-9]+]], !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] +; CHECK-NEXT: [[VAL_B:%.*]] = load i64, ptr addrspace(3) [[GEP_B]], align 4, !tbaa [[TBAA1]], !alias.scope [[META12:![0-9]+]], !noalias [[META13:![0-9]+]] +; CHECK-NEXT: store i64 1, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_DS_LOAD_STORES_AAINFO_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.ds_load_stores_aainfo.lds, i32 0, i32 2), align 16, !tbaa [[TBAA1]], !alias.scope [[META14:![0-9]+]], !noalias [[META15:![0-9]+]] +; CHECK-NEXT: [[VAL:%.*]] = add i64 [[VAL_A]], [[VAL_B]] +; CHECK-NEXT: store i64 [[VAL]], ptr addrspace(1) [[ARG]], align 4 +; CHECK-NEXT: tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) +; CHECK-NEXT: tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) +; CHECK-NEXT: tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, 
i32 0) +; CHECK-NEXT: ret void +; +bb: + %gep.a = getelementptr inbounds [64 x i32], ptr addrspace(3) @a, i32 0, i32 %i + %gep.b = getelementptr inbounds [64 x i32], ptr addrspace(3) @b, i32 0, i32 %i + + %val.a = load i64, ptr addrspace(3) %gep.a, align 4, !tbaa !0, !noalias !5 + %val.b = load i64, ptr addrspace(3) %gep.b, align 4, !tbaa !0, !noalias !5 + + store i64 1, ptr addrspace(3) @c, align 4, !tbaa !0, !noalias !2 + + %val = add i64 %val.a, %val.b + store i64 %val, ptr addrspace(1) %arg, align 4 + + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) + ret void +} + + !0 = !{!"omnipotent char", !1, i64 0} + !1 = !{!1} + !2 = !{!3} + !3 = distinct !{!3, !4} + !4 = distinct !{!4} + !5 = !{!3} +;. +; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0, i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]]} +; CHECK: [[META3]] = distinct !{[[META3]]} +; CHECK: [[META4]] = !{[[META5:![0-9]+]]} +; CHECK: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]} +; CHECK: [[META6]] = distinct !{[[META6]]} +; CHECK: [[META7]] = !{[[META8:![0-9]+]], [[META10:![0-9]+]], [[META11:![0-9]+]]} +; CHECK: [[META8]] = distinct !{[[META8]], [[META9:![0-9]+]]} +; CHECK: [[META9]] = distinct !{[[META9]]} +; CHECK: [[META10]] = distinct !{[[META10]], [[META6]]} +; CHECK: [[META11]] = distinct !{[[META11]], [[META6]]} +; CHECK: [[META12]] = !{[[META10]]} +; CHECK: [[META13]] = !{[[META8]], [[META5]], [[META11]]} +; CHECK: [[META14]] = !{[[META11]]} +; CHECK: [[META15]] = !{[[META8]], [[META5]], [[META10]]} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll index 007e777d0a61d..d39e4d9a9f14f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll @@ -60,7 +60,7 @@ define void @f0() { define amdgpu_kernel void @k_f0() { ; MODULE-LABEL: @k_f0( -; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META5:![0-9]+]], !noalias [[META1]] +; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META10:![0-9]+]], !noalias [[META1]] ; MODULE-NEXT: call void @f0() ; MODULE-NEXT: ret void ; @@ -83,9 +83,9 @@ define amdgpu_kernel void @k_f0() { @both.lds = addrspace(3) global i32 undef define void @f_both() { ; MODULE-LABEL: @f_both( -; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META4]] +; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META10]], !noalias [[META11:![0-9]+]] ; MODULE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 4 -; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META4]] +; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META10]], !noalias [[META11]] ; MODULE-NEXT: ret void ; ; TABLE-LABEL: @f_both( @@ -116,9 +116,9 @@ define void @f_both() { define amdgpu_kernel void @k0_both() { ; MODULE-LABEL: @k0_both( ; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] -; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META1]] +; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) 
@llvm.amdgcn.module.lds, align 4, !alias.scope [[META10]], !noalias [[META1]] ; MODULE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 5 -; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META1]] +; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META10]], !noalias [[META1]] ; MODULE-NEXT: call void @f_both() ; MODULE-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index 382f1a8c3f431..989ef6f981d9d 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -193,7 +193,6 @@ define amdgpu_kernel void @caller() { ; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-SDAG-NEXT: s_endpgm ; @@ -207,7 +206,6 @@ define amdgpu_kernel void @caller() { ; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir index 0fc31ea9d6437..ed22b353b0664 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir @@ -733,3 +733,70 @@ body: | liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc S_ENDPGM 0 ... 
+--- +name: test_no_sink_permlane_swap +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GFX9-LABEL: name: test_no_sink_permlane_swap + ; GFX9: bb.0: + ; GFX9-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: [[V_PERMLANE32_SWAP_B32_e64_:%[0-9]+]]:vgpr_32, [[V_PERMLANE32_SWAP_B32_e64_1:%[0-9]+]]:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], 0, 0, implicit $exec + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: S_BRANCH %bb.1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1: + ; GFX9-NEXT: successors: %bb.2(0x80000000) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_PERMLANE32_SWAP_B32_e64_]], [[V_PERMLANE32_SWAP_B32_e64_1]], implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2: + ; GFX9-NEXT: successors: %bb.3(0x80000000) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_MAX_I32_e64_]], %bb.1 + ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.3: + ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]] + bb.0: + successors: 
%bb.2(0x40000000), %bb.1(0x40000000) + liveins: $vgpr0 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:sreg_64 = S_MOV_B64 0 + %3:vreg_64 = COPY %2 + %4:vgpr_32 = GLOBAL_LOAD_DWORD killed %3, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %5:vgpr_32, %6:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 %4, %4, 0, 0, implicit $exec + %7:vgpr_32 = COPY $vgpr0 + %8:sreg_32 = S_MOV_B32 1 + %9:sreg_64 = V_CMP_LT_I32_e64 %7, %8, implicit $exec + %10:sreg_64 = COPY %9 + %11:sreg_64 = SI_IF %10, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + %12:vgpr_32 = V_MAX_I32_e64 %5, %6, implicit $exec + + bb.2: + successors: %bb.3(0x80000000) + + %13:vgpr_32 = PHI %1, %bb.0, %12, %bb.1 + SI_END_CF %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + + bb.3: + S_ENDPGM 0, implicit %13 +... diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index c826980991f94..e96c66faf493c 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -43,25 +43,25 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; CHECK-NEXT: s_load_dwordx8 s[48:55], s[8:9], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[64:71], s[8:9], 0x0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 -; CHECK-NEXT: s_add_u32 s44, s34, 40 +; CHECK-NEXT: s_add_u32 s52, s34, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_mov_b32 s33, s16 -; CHECK-NEXT: s_addc_u32 s45, s35, 0 -; 
CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_addc_u32 s53, s35, 0 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_mov_b32 s42, s15 +; CHECK-NEXT: s_mov_b32 s50, s15 ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12 @@ -70,12 +70,12 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v43, v0 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4 @@ -84,12 +84,12 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v41, v0 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: 
s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360 ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -99,15 +99,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0 ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1 -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: global_load_dword v0, v0, s[52:53] -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: global_load_dword v0, v0, s[68:69] +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4 @@ -117,7 +117,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v1, 12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v42, v0 -; CHECK-NEXT: s_mov_b32 s44, exec_lo +; CHECK-NEXT: s_mov_b32 s52, exec_lo ; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_25 ; CHECK-NEXT: ; %bb.1: ; %.preheader5 @@ -136,7 +136,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42 -; CHECK-NEXT: s_mov_b32 s45, 0 +; CHECK-NEXT: s_mov_b32 s53, 0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45 ; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB0_25 @@ -144,57 +144,57 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: 
v_lshlrev_b32_e32 v43, 10, v43 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0 ; CHECK-NEXT: v_mov_b32_e32 v47, 0 -; CHECK-NEXT: s_mov_b32 s47, 0 +; CHECK-NEXT: s_mov_b32 s55, 0 ; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_8 Depth 2 ; CHECK-NEXT: ; Child Loop BB0_20 Depth 2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s47, v44 -; CHECK-NEXT: s_lshl_b32 s4, s47, 5 -; CHECK-NEXT: s_add_i32 s46, s47, 1 -; CHECK-NEXT: s_add_i32 s5, s47, 5 -; CHECK-NEXT: v_or3_b32 v57, s4, v43, s46 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v44 +; CHECK-NEXT: s_lshl_b32 s4, s55, 5 +; CHECK-NEXT: s_add_i32 s54, s55, 1 +; CHECK-NEXT: s_add_i32 s5, s55, 5 +; CHECK-NEXT: v_or3_b32 v57, s4, v43, s54 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_read_u8 v56, v0 -; CHECK-NEXT: v_mov_b32_e32 v58, s46 -; CHECK-NEXT: s_mov_b32 s52, exec_lo +; CHECK-NEXT: v_mov_b32_e32 v58, s54 +; CHECK-NEXT: s_mov_b32 s68, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_17 ; CHECK-NEXT: ; %bb.6: ; %.preheader2 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_mov_b32 s53, 0 -; CHECK-NEXT: s_mov_b32 s56, 0 +; CHECK-NEXT: s_mov_b32 s69, 0 +; CHECK-NEXT: s_mov_b32 s80, 0 ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 -; CHECK-NEXT: s_add_i32 s56, s56, 4 -; CHECK-NEXT: s_add_i32 s4, s47, s56 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s56, v57 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 +; CHECK-NEXT: s_add_i32 s80, s80, 4 +; CHECK-NEXT: s_add_i32 s4, s55, s80 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s80, v57 ; CHECK-NEXT: s_add_i32 s5, s4, 5 ; CHECK-NEXT: s_add_i32 s4, s4, 1 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42 ; CHECK-NEXT: v_mov_b32_e32 v58, s4 -; CHECK-NEXT: s_or_b32 s53, vcc_lo, s53 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: s_or_b32 s69, vcc_lo, s69 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s69 ; CHECK-NEXT: s_cbranch_execz 
.LBB0_16 ; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_add_nc_u32_e32 v59, s56, v46 -; CHECK-NEXT: v_add_nc_u32_e32 v58, s56, v57 +; CHECK-NEXT: v_add_nc_u32_e32 v59, s80, v46 +; CHECK-NEXT: v_add_nc_u32_e32 v58, s80, v57 ; CHECK-NEXT: ds_read_u8 v0, v59 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s57, s4 +; CHECK-NEXT: s_and_saveexec_b32 s81, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -204,22 +204,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s57, s4 +; CHECK-NEXT: s_and_saveexec_b32 s81, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; 
CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 @@ -230,22 +230,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 ; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s57, s4 +; CHECK-NEXT: s_and_saveexec_b32 s81, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_14 ; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 @@ -256,22 +256,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 ; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 ; CHECK-NEXT: 
ds_read_u8 v0, v59 offset:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s57, s4 +; CHECK-NEXT: s_and_saveexec_b32 s81, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_7 ; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 @@ -284,27 +284,27 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_branch .LBB0_7 ; CHECK-NEXT: .LBB0_16: ; %Flow45 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69 ; CHECK-NEXT: v_mov_b32_e32 v57, v0 ; CHECK-NEXT: .LBB0_17: ; %Flow46 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 -; CHECK-NEXT: s_mov_b32 s47, exec_lo +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68 +; CHECK-NEXT: s_mov_b32 s55, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_23 ; CHECK-NEXT: ; %bb.18: ; %.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_mov_b32 s52, 0 +; CHECK-NEXT: s_mov_b32 s68, 0 ; CHECK-NEXT: s_inst_prefetch 0x1 ; CHECK-NEXT: s_branch .LBB0_20 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_20 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69 ; CHECK-NEXT: v_add_nc_u32_e32 v58, 1, v58 ; 
CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58, v42 -; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 +; CHECK-NEXT: s_or_b32 s68, vcc_lo, s68 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s68 ; CHECK-NEXT: s_cbranch_execz .LBB0_22 ; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 @@ -312,18 +312,18 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s53, s4 +; CHECK-NEXT: s_and_saveexec_b32 s69, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_19 ; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -336,31 +336,31 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_22: ; %Flow43 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68 ; CHECK-NEXT: .LBB0_23: ; %Flow44 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s47 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s46, v45 +; CHECK-NEXT: 
v_cmp_ge_u32_e32 vcc_lo, s54, v45 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 -; CHECK-NEXT: s_mov_b32 s47, s46 +; CHECK-NEXT: s_mov_b32 s55, s54 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_or_b32 s45, s4, s45 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s45 +; CHECK-NEXT: s_or_b32 s53, s4, s53 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53 ; CHECK-NEXT: s_cbranch_execnz .LBB0_5 ; CHECK-NEXT: .LBB0_25: ; %Flow51 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 @@ -373,19 +373,19 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v41 ; CHECK-NEXT: s_cbranch_execz .LBB0_33 ; CHECK-NEXT: ; %bb.26: -; CHECK-NEXT: s_mov_b32 s44, 0 +; CHECK-NEXT: s_mov_b32 s52, 0 ; CHECK-NEXT: s_branch .LBB0_28 ; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 
s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4 @@ -393,12 +393,12 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41 -; CHECK-NEXT: s_or_b32 s44, vcc_lo, s44 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: s_cbranch_execz .LBB0_33 ; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41 -; CHECK-NEXT: s_mov_b32 s45, exec_lo +; CHECK-NEXT: s_mov_b32 s53, exec_lo ; CHECK-NEXT: ds_read_b32 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0 @@ -407,8 +407,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62 ; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72 -; CHECK-NEXT: v_add_co_u32 v2, s4, s48, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s49, 0, s4 +; CHECK-NEXT: v_add_co_u32 v2, s4, s64, v1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s65, 0, s4 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -442,8 +442,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: v_or3_b32 v2, v3, v2, v4 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_or3_b32 v73, v2, v0, v1 ; CHECK-NEXT: 
s_getpc_b64 s[16:17] @@ -454,11 +454,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffc, v0 ; CHECK-NEXT: v_lshlrev_b32_e64 v44, v1, 1 ; CHECK-NEXT: v_and_b32_e32 v74, 28, v1 -; CHECK-NEXT: v_add_co_u32 v42, s4, s54, v0 -; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s55, 0, s4 +; CHECK-NEXT: v_add_co_u32 v42, s4, s70, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s71, 0, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, v44 ; CHECK-NEXT: v_mov_b32_e32 v0, v42 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: v_mov_b32_e32 v1, v43 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_bfe_u32 v0, v0, v74, 4 @@ -469,7 +469,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1 ; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58 ; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57] -; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[50:51] +; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[66:67] ; CHECK-NEXT: v_lshlrev_b32_e32 v10, 5, v0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72 @@ -500,11 +500,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v2, v44 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj@rel32@lo+4 @@ -792,25 +792,25 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; 
CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; CHECK-NEXT: s_load_dwordx2 s[46:47], s[8:9], 0x10 +; CHECK-NEXT: s_load_dwordx2 s[54:55], s[8:9], 0x10 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 -; CHECK-NEXT: s_add_u32 s44, s38, 40 +; CHECK-NEXT: s_add_u32 s52, s38, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_mov_b32 s33, s16 -; CHECK-NEXT: s_addc_u32 s45, s39, 0 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_addc_u32 s53, s39, 0 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_mov_b32 s42, s15 +; CHECK-NEXT: s_mov_b32 s50, s15 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12 @@ -819,12 +819,12 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_mov_b32_e32 v42, v0 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: 
s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4 @@ -833,12 +833,12 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360 ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -849,15 +849,15 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0 ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1 -; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: global_load_dword v0, v0, s[46:47] -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: global_load_dword v0, v0, s[54:55] +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4 @@ -868,7 +868,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v41, v0 ; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42 -; CHECK-NEXT: s_mov_b32 s44, 0 +; CHECK-NEXT: s_mov_b32 s52, 0 ; CHECK-NEXT: 
s_mov_b32 s4, 0 ; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41 @@ -878,12 +878,12 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: ; Child Loop BB1_8 Depth 2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44 ; CHECK-NEXT: s_lshl_b32 s5, s4, 5 -; CHECK-NEXT: s_add_i32 s45, s4, 1 +; CHECK-NEXT: s_add_i32 s53, s4, 1 ; CHECK-NEXT: s_add_i32 s6, s4, 5 -; CHECK-NEXT: v_or3_b32 v47, s5, v42, s45 +; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_read_u8 v46, v0 -; CHECK-NEXT: v_mov_b32_e32 v56, s45 +; CHECK-NEXT: v_mov_b32_e32 v56, s53 ; CHECK-NEXT: s_mov_b32 s5, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41 ; CHECK-NEXT: s_cbranch_execz .LBB1_5 @@ -912,23 +912,23 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: .LBB1_5: ; %Flow4 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; CHECK-NEXT: s_mov_b32 s46, exec_lo +; CHECK-NEXT: s_mov_b32 s54, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41 ; CHECK-NEXT: s_cbranch_execz .LBB1_11 ; CHECK-NEXT: ; %bb.6: ; %.103.preheader ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_mov_b32 s47, 0 +; CHECK-NEXT: s_mov_b32 s55, 0 ; CHECK-NEXT: s_inst_prefetch 0x1 ; CHECK-NEXT: s_branch .LBB1_8 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_7: ; %.114 ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64 ; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41 -; CHECK-NEXT: s_or_b32 s47, vcc_lo, s47 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s47 +; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: s_cbranch_execz .LBB1_10 ; CHECK-NEXT: .LBB1_8: ; %.103 ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 @@ 
-937,7 +937,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s48, s4 +; CHECK-NEXT: s_and_saveexec_b32 s64, s4 ; CHECK-NEXT: s_cbranch_execz .LBB1_7 ; CHECK-NEXT: ; %bb.9: ; %.110 ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 @@ -945,11 +945,11 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s38, 40 ; CHECK-NEXT: s_addc_u32 s9, s39, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43 ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -962,31 +962,31 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: .LBB1_10: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s47 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: .LBB1_11: ; %Flow2 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s54 ; CHECK-NEXT: ; %bb.12: ; %.32 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s45, v45 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v45 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_or_b32 s44, s4, s44 -; CHECK-NEXT: s_mov_b32 s4, s45 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; CHECK-NEXT: s_or_b32 s52, s4, s52 
+; CHECK-NEXT: s_mov_b32 s4, s53 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: s_cbranch_execnz .LBB1_1 ; CHECK-NEXT: ; %bb.13: ; %.119 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_add_u32 s8, s38, 40 ; CHECK-NEXT: s_addc_u32 s9, s39, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s43 -; CHECK-NEXT: s_mov_b32 s13, s42 +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 7d18739fd0c32..11c62a7312755 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -381,20 +381,25 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v0, v13, v[9:10] ; GFX12-NEXT: v_mov_b32_e32 v10, v8 ; GFX12-NEXT: v_mad_co_i64_i32 v[8:9], null, v1, v12, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add_co_u32 v10, s0, v11, v10 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v13, v0, v[8:9] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v12, v13, v[10:11] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: 
v_add_co_u32 v8, vcc_lo, v8, v0 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i128 %sext1 = sext i32 %arg1 to i128 @@ -1161,19 +1166,22 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_xor_b32_e32 v2, v2, v4 ; GFX12-NEXT: v_xor_b32_e32 v3, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 @@ 
-1249,12 +1257,14 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 @@ -1795,11 +1805,14 @@ define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = add i64 %arg0, 1 %lsh = lshr i64 %arg0, 32 diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir index d59bcfb16eece..48391cba278c0 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir @@ -730,7 +730,8 @@ body: | ... 
# GCN-LABEL: name: smfmac16x16_write_vgpr_flat_read # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: FLAT_STORE_DWORD name: smfmac16x16_write_vgpr_flat_read body: | @@ -741,7 +742,8 @@ body: | # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_flat_read # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: FLAT_STORE_DWORD name: xdl_smfma16x16_write_vgpr_flat_read body: | @@ -752,7 +754,8 @@ body: | # GCN-LABEL: name: smfmac32x32_write_vgpr_flat_read # GCN: V_SMFMAC # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: FLAT_STORE_DWORD name: smfmac32x32_write_vgpr_flat_read body: | @@ -764,7 +767,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: FLAT_STORE_DWORD name: xdl_smfma32x32_write_vgpr_flat_read body: | @@ -819,7 +823,8 @@ body: | # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_read # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 name: xdl_smfma16x16_write_vgpr_valu_read body: | @@ -831,7 +836,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 name: xdl_smfma32x32_write_vgpr_valu_read body: | @@ -877,7 +883,8 @@ body: | # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_accv_read # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: xdl_smfma16x16_write_vgpr_accv_read body: | @@ -889,7 +896,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: xdl_smfma32x32_write_vgpr_accv_read body: | @@ -946,7 +954,8 @@ body: | # GCN-LABEL: 
name: xdl_smfma16x16_write_vgpr_valu_write # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 name: xdl_smfma16x16_write_vgpr_valu_write body: | @@ -958,7 +967,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 name: xdl_smfma32x32_write_vgpr_valu_write body: | @@ -979,7 +989,8 @@ body: | # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_f16_write # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_FMA_F16_e64 name: xdl_smfma16x16_write_vgpr_valu_f16_write body: | @@ -991,7 +1002,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_FMA_F16_e64 name: xdl_smfma32x32_write_vgpr_valu_f16_write body: | @@ -1012,7 +1024,8 @@ body: | # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_sdwa_write # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32_sdwa name: xdl_smfma16x16_write_vgpr_valu_sdwa_write body: | @@ -1024,7 +1037,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32_sdwa name: xdl_smfma32x32_write_vgpr_valu_sdwa_write body: | @@ -1745,7 +1759,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MOV_B32 name: xdl_sgemm16X16X16_mfma_write_vgpr_valu_write body: | @@ -1755,7 +1770,8 @@ body: | ... 
# GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_vm_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: BUFFER_STORE_DWORD name: xdl_sgemm16X16X16_mfma_write_vgpr_vm_read body: | @@ -1765,7 +1781,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MOV_B32 name: xdl_sgemm16X16X16_mfma_write_vgpr_valu_read body: | @@ -1775,7 +1792,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_dot_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_DOT name: xdl_sgemm16X16X16_mfma_write_vgpr_dot_read body: | @@ -2052,7 +2070,8 @@ body: | ... # GCN-LABEL: name: smfmac16x16_read_vgpr_srcc_valu_write # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MOV_B32 name: smfmac16x16_read_vgpr_srcc_valu_write body: | @@ -2082,7 +2101,8 @@ body: | # GCN-LABEL: name: smfmac32x32_read_vgpr_srcc_valu_write # GCN: V_SMFMAC # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 name: smfmac32x32_read_vgpr_srcc_valu_write body: | diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir index 433236180b137..e3a4a19b8e2b6 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir @@ -157,19 +157,19 @@ name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__c tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, 
$sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 
12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... 
@@ -180,18 +180,18 @@ name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__c tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: 
renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... 
@@ -202,19 +202,19 @@ name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_v tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: 
renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, 
$vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... 
@@ -225,18 +225,18 @@ name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_v tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: 
renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, 
implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... 
@@ -247,18 +247,18 @@ name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_ tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 2 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept 
V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -269,17 +269,17 @@ name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_ tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept 
V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 6 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, 
implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index 88aff57d2b9ad..8c5372c59b6e7 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -5,19 +5,13 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck -check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX942 %s ; We aren't pressuring the SGPRs, so this can use the add with carry out pre-gfx9. 
define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: ; GFX10_1: ; %bb.0: ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 @@ -30,23 +24,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: ; GFX10_3: ; %bb.0: ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 @@ -59,22 +41,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 
0x80880 -; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4044 -; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v1, s59, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -90,12 +61,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v1, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4044 -; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: @@ -105,11 +70,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: s_add_co_ci_u32 s0, s32, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 @@ -124,23 +84,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v1, 0 -; GFX12-NEXT: 
s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v1, s59, 0 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-NEXT: ;;#ASMSTART @@ -149,27 +98,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: s_movk_i32 s59, 0x4040 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0 -; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec +; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v1, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v1, s59, 0 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: ;;#ASMSTART @@ 
-177,47 +115,30 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0 -; GFX900-NEXT: v_readfirstlane_b32 s59, v0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec +; GFX900-NEXT: v_readfirstlane_b32 s59, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v1, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4044 -; GFX940-NEXT: scratch_store_dword off, v1, s2 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v1, s59, 0 -; GFX940-NEXT: s_add_i32 s0, s32, 64 -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: s_and_b64 s[0:1], 0, exec -; GFX940-NEXT: s_addc_u32 s0, s32, 0x4040 -; GFX940-NEXT: s_bitcmp1_b32 s0, 0 -; GFX940-NEXT: s_bitset0_b32 s0, 0 -; GFX940-NEXT: s_mov_b32 s59, s0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use alloca0 v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s59, scc -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s59, v1, 0 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4044 -; GFX940-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_add_i32 s0, s32, 64 +; GFX942-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NEXT: s_and_b64 s[0:1], 0, exec +; GFX942-NEXT: s_addc_u32 s0, s32, 0x4040 +; GFX942-NEXT: s_bitcmp1_b32 s0, 0 +; GFX942-NEXT: s_bitset0_b32 s0, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use alloca0 v0 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s59, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s59, scc +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0) @@ -231,12 +152,6 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_dead_scc: ; GFX10_1: ; %bb.0: ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_lshr_b32 s59, s32, 5 ; GFX10_1-NEXT: s_addk_i32 s59, 0x4040 @@ -247,23 +162,11 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_dead_scc: ; GFX10_3: ; %bb.0: ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_lshr_b32 s59, s32, 5 ; GFX10_3-NEXT: s_addk_i32 s59, 0x4040 @@ -274,22 +177,11 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880 -; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_dead_scc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4044 -; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v1, s59, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -301,12 +193,6 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v1, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4044 -; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_dead_scc: @@ -316,107 +202,63 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 s59, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s59, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v1, 0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_dead_scc: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v1, s59, 0 -; GFX8-NEXT: s_lshr_b32 s59, s32, 6 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX8-NEXT: s_addk_i32 s59, 0x4040 +; GFX8-NEXT: s_lshr_b32 s59, s32, 6 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use alloca0 v0 ; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: s_addk_i32 s59, 0x4040 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59 ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v1, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_dead_scc: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v1, s59, 0 -; GFX900-NEXT: s_lshr_b32 s59, s32, 6 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX900-NEXT: s_addk_i32 s59, 0x4040 +; GFX900-NEXT: s_lshr_b32 s59, s32, 6 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use alloca0 v0 ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_addk_i32 s59, 0x4040 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59 ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v1, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101100 -; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: scalar_mov_materializes_frame_index_dead_scc: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4044 -; GFX940-NEXT: scratch_store_dword off, v1, s2 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v1, s59, 0 -; GFX940-NEXT: s_add_i32 s0, s32, 64 -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: s_add_i32 s0, s32, 0x4040 -; GFX940-NEXT: s_mov_b32 s59, s0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use alloca0 v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s59 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s59, v1, 0 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4044 -; 
GFX940-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: scalar_mov_materializes_frame_index_dead_scc: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_add_i32 s0, s32, 64 +; GFX942-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NEXT: s_add_i32 s0, s32, 0x4040 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use alloca0 v0 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s59, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s59 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0) @@ -430,14 +272,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_1-NEXT: s_mov_b32 s5, s33 ; GFX10_1-NEXT: s_mov_b32 s33, s32 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80880 -; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 -; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 @@ -450,14 +286,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80880 -; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded 
Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: s_mov_b32 s33, s5 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: @@ -465,13 +294,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_3-NEXT: s_mov_b32 s5, s33 ; GFX10_3-NEXT: s_mov_b32 s33, s32 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80880 -; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 -; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 @@ -484,13 +308,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80880 -; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: s_mov_b32 s33, s5 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: @@ -498,12 +316,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s2, s33, 0x4044 -; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, 
s0 ; GFX11-NEXT: s_addk_i32 s32, 0x4080 -; GFX11-NEXT: v_writelane_b32 v1, s59, 0 ; GFX11-NEXT: s_add_i32 s0, s33, 64 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -519,13 +332,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v1, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s2, s33, 0x4044 -; GFX11-NEXT: scratch_load_b32 v1, off, s2 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s33, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: @@ -537,13 +344,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, s33 ; GFX12-NEXT: s_mov_b32 s33, s32 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v1, s33 offset:16388 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 -; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_co_ci_u32 s0, s33, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s33 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -557,14 +360,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v1, 0 ; GFX12-NEXT: s_mov_b32 s32, s33 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_mov_b32 s33, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ 
-573,12 +370,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s7, s33, 0x101100 -; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_add_i32 s32, s32, 0x102000 -; GFX8-NEXT: v_writelane_b32 v1, s59, 0 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-NEXT: ;;#ASMSTART @@ -586,20 +377,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX8-NEXT: s_movk_i32 s59, 0x4040 +; GFX8-NEXT: s_add_i32 s32, s32, 0x102000 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0 -; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec +; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v1, 0 ; GFX8-NEXT: s_mov_b32 s32, s33 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s7, s33, 0x101100 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_mov_b32 s33, s6 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: @@ -607,67 +393,45 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_mov_b32 s6, s33 ; GFX900-NEXT: s_mov_b32 s33, s32 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s7, s33, 0x101100 -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_add_i32 s32, s32, 0x102000 -; 
GFX900-NEXT: v_writelane_b32 v1, s59, 0 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use alloca0 v0 ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX900-NEXT: s_add_i32 s32, s32, 0x102000 ; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0 -; GFX900-NEXT: v_readfirstlane_b32 s59, v0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec +; GFX900-NEXT: v_readfirstlane_b32 s59, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v1, 0 ; GFX900-NEXT: s_mov_b32 s32, s33 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s7, s33, 0x101100 -; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: s_mov_b32 s33, s6 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b32 s2, s33 -; GFX940-NEXT: s_mov_b32 s33, s32 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s3, s33, 0x4044 -; GFX940-NEXT: scratch_store_dword off, v1, s3 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_addk_i32 s32, 0x4080 -; GFX940-NEXT: v_writelane_b32 v1, s59, 0 -; GFX940-NEXT: s_add_i32 s0, s33, 64 -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: s_and_b64 s[0:1], 0, exec -; GFX940-NEXT: s_addc_u32 s0, s33, 0x4040 -; GFX940-NEXT: s_bitcmp1_b32 s0, 0 -; GFX940-NEXT: s_bitset0_b32 s0, 0 -; GFX940-NEXT: s_mov_b32 s59, s0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use alloca0 v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s59, scc -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s59, v1, 0 -; GFX940-NEXT: s_mov_b32 s32, s33 -; GFX940-NEXT: s_xor_saveexec_b64 
s[0:1], -1 -; GFX940-NEXT: s_add_i32 s3, s33, 0x4044 -; GFX940-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_mov_b32 s33, s2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s2, s33 +; GFX942-NEXT: s_mov_b32 s33, s32 +; GFX942-NEXT: s_addk_i32 s32, 0x4080 +; GFX942-NEXT: s_add_i32 s0, s33, 64 +; GFX942-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NEXT: s_and_b64 s[0:1], 0, exec +; GFX942-NEXT: s_addc_u32 s0, s33, 0x4040 +; GFX942-NEXT: s_bitcmp1_b32 s0, 0 +; GFX942-NEXT: s_bitset0_b32 s0, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use alloca0 v0 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s59, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s59, scc +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s32, s33 +; GFX942-NEXT: s_mov_b32 s33, s2 +; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0) @@ -679,59 +443,30 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset() ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: ; GFX10_1: ; %bb.0: ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32 +; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1 -; GFX10_1-NEXT: 
v_readfirstlane_b32 s59, v1 +; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 +; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: ; GFX10_3: ; %bb.0: ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32 +; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1 -; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1 +; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 +; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4040 -; GFX11-NEXT: scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill -; 
GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v0, s59, 0 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: s_addc_u32 s0, s32, 64 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -741,12 +476,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset() ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4040 -; GFX11-NEXT: scratch_load_b32 v0, off, s1 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: @@ -756,96 +485,51 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset() ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v0, s59, 0 -; GFX12-NEXT: s_mov_b32 s59, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo +; GFX12-NEXT: s_mov_b32 s59, s32 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readlane_b32 s59, v0, 0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 
-; GFX8-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v0, s59, 0 -; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 +; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: s_mov_b32 s59, 64 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s59, v1 -; GFX8-NEXT: v_readfirstlane_b32 s59, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec +; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v0, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v0, s59, 0 -; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s32 -; GFX900-NEXT: v_add_u32_e32 v1, 64, v1 -; GFX900-NEXT: v_readfirstlane_b32 s59, v1 +; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec +; GFX900-NEXT: v_readfirstlane_b32 s59, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v0, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4040 -; GFX940-NEXT: scratch_store_dword off, v0, s2 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s59, 0 -; GFX940-NEXT: s_and_b64 s[0:1], 0, exec -; GFX940-NEXT: s_addc_u32 s0, s32, 64 -; GFX940-NEXT: s_bitcmp1_b32 s0, 0 -; GFX940-NEXT: s_bitset0_b32 s0, 0 -; GFX940-NEXT: s_mov_b32 s59, s0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s59, scc -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s59, v0, 0 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4040 -; GFX940-NEXT: scratch_load_dword v0, off, s2 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_and_b64 s[0:1], 0, exec +; GFX942-NEXT: s_addc_u32 s0, s32, 64 +; GFX942-NEXT: s_bitcmp1_b32 s0, 0 +; GFX942-NEXT: s_bitset0_b32 s0, 0 +; GFX942-NEXT: s_mov_b32 s59, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s59, scc +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca0, i32 0) ret void @@ -855,67 +539,32 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0 ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: ; GFX10_1: ; %bb.0: ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_1-NEXT: 
buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_1-NEXT: s_lshr_b32 s59, s32, 5 ; GFX10_1-NEXT: s_add_i32 s59, s59, 64 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: ; GFX10_3: ; %bb.0: ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_3-NEXT: s_lshr_b32 s59, s32, 5 ; GFX10_3-NEXT: s_add_i32 s59, s59, 64 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800 -; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4040 -; GFX11-NEXT: scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v0, s59, 0 ; GFX11-NEXT: s_add_i32 s0, 
s32, 64 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s59, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x4040 -; GFX11-NEXT: scratch_load_b32 v0, off, s1 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: @@ -925,87 +574,42 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v0, s59, 0 ; GFX12-NEXT: s_mov_b32 s59, s32 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readlane_b32 s59, v0, 0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v0, s59, 0 ; GFX8-NEXT: s_lshr_b32 s59, s32, 6 ; GFX8-NEXT: 
s_add_i32 s59, s59, 64 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59 ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v0, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v0, s59, 0 ; GFX900-NEXT: s_lshr_b32 s59, s32, 6 ; GFX900-NEXT: s_add_i32 s59, s59, 64 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59 ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v0, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x101000 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4040 -; GFX940-NEXT: scratch_store_dword off, v0, s2 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s59, 0 -; GFX940-NEXT: s_add_i32 s0, s32, 64 -; GFX940-NEXT: s_mov_b32 s59, s0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s59 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s59, v0, 0 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4040 -; GFX940-NEXT: scratch_load_dword 
v0, off, s2 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_add_i32 s0, s32, 64 +; GFX942-NEXT: s_mov_b32 s59, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s59 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca0) ret void @@ -1017,29 +621,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_1-NEXT: s_mov_b32 s5, s33 ; GFX10_1-NEXT: s_mov_b32 s33, s32 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 -; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s33 +; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10_1-NEXT: s_add_i32 s32, s32, 0x80800 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: s_mov_b32 s32, s33 -; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1 -; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1 +; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 +; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: s_mov_b32 s33, s5 -; 
GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: @@ -1047,27 +638,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_3-NEXT: s_mov_b32 s5, s33 ; GFX10_3-NEXT: s_mov_b32 s33, s32 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 -; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s33 +; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10_3-NEXT: s_add_i32 s32, s32, 0x80800 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: s_mov_b32 s32, s33 -; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1 -; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1 +; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 +; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: s_mov_b32 s33, s5 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: @@ -1075,12 +655,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 -; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 
exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0x4080 -; GFX11-NEXT: v_writelane_b32 v0, s59, 0 +; GFX11-NEXT: s_addk_i32 s32, 0x4040 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: s_addc_u32 s0, s33, 64 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -1091,13 +666,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 -; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s33, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: @@ -1109,26 +678,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, s33 ; GFX12-NEXT: s_mov_b32 s33, s32 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 -; GFX12-NEXT: v_writelane_b32 v0, s59, 0 -; GFX12-NEXT: s_mov_b32 s59, s33 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s59, s33 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_mov_b32 s32, s33 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_mov_b32 s33, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1137,28 +695,17 @@ 
define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s7, s33, 0x101000 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_add_i32 s32, s32, 0x102000 -; GFX8-NEXT: v_writelane_b32 v0, s59, 0 -; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX8-NEXT: s_mov_b32 s59, 64 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s59, v1 -; GFX8-NEXT: v_readfirstlane_b32 s59, v1 +; GFX8-NEXT: s_add_i32 s32, s32, 0x101000 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec +; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v0, 0 ; GFX8-NEXT: s_mov_b32 s32, s33 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s7, s33, 0x101000 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_mov_b32 s33, s6 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: @@ -1166,57 +713,35 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_mov_b32 s6, s33 ; GFX900-NEXT: s_mov_b32 s33, s32 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s7, s33, 0x101000 -; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_add_i32 s32, s32, 0x102000 -; GFX900-NEXT: v_writelane_b32 v0, s59, 0 -; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s33 -; GFX900-NEXT: v_add_u32_e32 v1, 64, v1 -; 
GFX900-NEXT: v_readfirstlane_b32 s59, v1 +; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX900-NEXT: s_add_i32 s32, s32, 0x101000 +; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec +; GFX900-NEXT: v_readfirstlane_b32 s59, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v0, 0 ; GFX900-NEXT: s_mov_b32 s32, s33 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s7, s33, 0x101000 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: s_mov_b32 s33, s6 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b32 s2, s33 -; GFX940-NEXT: s_mov_b32 s33, s32 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s3, s33, 0x4040 -; GFX940-NEXT: scratch_store_dword off, v0, s3 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_addk_i32 s32, 0x4080 -; GFX940-NEXT: v_writelane_b32 v0, s59, 0 -; GFX940-NEXT: s_and_b64 s[0:1], 0, exec -; GFX940-NEXT: s_addc_u32 s0, s33, 64 -; GFX940-NEXT: s_bitcmp1_b32 s0, 0 -; GFX940-NEXT: s_bitset0_b32 s0, 0 -; GFX940-NEXT: s_mov_b32 s59, s0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s59, scc -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s59, v0, 0 -; GFX940-NEXT: s_mov_b32 s32, s33 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s3, s33, 0x4040 -; GFX940-NEXT: scratch_load_dword v0, off, s3 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_mov_b32 s33, s2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp: +; GFX942: ; 
%bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s2, s33 +; GFX942-NEXT: s_mov_b32 s33, s32 +; GFX942-NEXT: s_addk_i32 s32, 0x4040 +; GFX942-NEXT: s_and_b64 s[0:1], 0, exec +; GFX942-NEXT: s_addc_u32 s0, s33, 64 +; GFX942-NEXT: s_bitcmp1_b32 s0, 0 +; GFX942-NEXT: s_bitset0_b32 s0, 0 +; GFX942-NEXT: s_mov_b32 s59, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s59, scc +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s32, s33 +; GFX942-NEXT: s_mov_b32 s33, s2 +; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca0, i32 0) ret void @@ -1228,27 +753,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_1-NEXT: s_mov_b32 s4, s33 ; GFX10_1-NEXT: s_mov_b32 s33, s32 -; GFX10_1-NEXT: s_xor_saveexec_b32 s5, -1 -; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s5 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 -; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 +; GFX10_1-NEXT: s_add_i32 s32, s32, 0x80800 ; GFX10_1-NEXT: s_lshr_b32 s59, s33, 5 ; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: s_add_i32 s59, s59, 64 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s5, -1 -; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s5 ; GFX10_1-NEXT: s_mov_b32 s33, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: @@ -1256,25 
+768,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_3-NEXT: s_mov_b32 s4, s33 ; GFX10_3-NEXT: s_mov_b32 s33, s32 -; GFX10_3-NEXT: s_xor_saveexec_b32 s5, -1 -; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s5 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 -; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 +; GFX10_3-NEXT: s_add_i32 s32, s32, 0x80800 ; GFX10_3-NEXT: s_lshr_b32 s59, s33, 5 ; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: s_add_i32 s59, s59, 64 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v0, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s5, -1 -; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 -; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s5 ; GFX10_3-NEXT: s_mov_b32 s33, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: @@ -1282,25 +783,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 -; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0x4080 -; GFX11-NEXT: v_writelane_b32 v0, s59, 0 +; GFX11-NEXT: s_addk_i32 s32, 0x4040 ; GFX11-NEXT: s_add_i32 s1, s33, 64 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_mov_b32 s59, s1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX11-NEXT: s_add_i32 s2, 
s33, 0x4040 -; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_mov_b32 s33, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: @@ -1312,25 +802,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s0, s33 ; GFX12-NEXT: s_mov_b32 s33, s32 -; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 -; GFX12-NEXT: v_writelane_b32 v0, s59, 0 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 s59, s33 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_mov_b32 s32, s33 -; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_mov_b32 s33, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -1339,25 +818,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX8-NEXT: s_add_i32 s5, s33, 0x101000 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_add_i32 s32, s32, 0x102000 -; GFX8-NEXT: v_writelane_b32 v0, s59, 0 +; GFX8-NEXT: s_add_i32 s32, s32, 0x101000 ; GFX8-NEXT: s_lshr_b32 s59, s33, 6 ; GFX8-NEXT: s_add_i32 s59, s59, 64 ; GFX8-NEXT: 
;;#ASMSTART ; GFX8-NEXT: ; use s59 ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v0, 0 ; GFX8-NEXT: s_mov_b32 s32, s33 -; GFX8-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX8-NEXT: s_add_i32 s5, s33, 0x101000 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b32 s33, s4 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: @@ -1365,52 +833,30 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_mov_b32 s4, s33 ; GFX900-NEXT: s_mov_b32 s33, s32 -; GFX900-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX900-NEXT: s_add_i32 s5, s33, 0x101000 -; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[6:7] -; GFX900-NEXT: s_add_i32 s32, s32, 0x102000 -; GFX900-NEXT: v_writelane_b32 v0, s59, 0 +; GFX900-NEXT: s_add_i32 s32, s32, 0x101000 ; GFX900-NEXT: s_lshr_b32 s59, s33, 6 ; GFX900-NEXT: s_add_i32 s59, s59, 64 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59 ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v0, 0 ; GFX900-NEXT: s_mov_b32 s32, s33 -; GFX900-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX900-NEXT: s_add_i32 s5, s33, 0x101000 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[6:7] ; GFX900-NEXT: s_mov_b32 s33, s4 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b32 s0, s33 -; GFX940-NEXT: s_mov_b32 s33, s32 -; GFX940-NEXT: s_xor_saveexec_b64 s[2:3], -1 -; GFX940-NEXT: s_add_i32 s1, s33, 0x4040 -; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; 4-byte 
Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_addk_i32 s32, 0x4080 -; GFX940-NEXT: v_writelane_b32 v0, s59, 0 -; GFX940-NEXT: s_add_i32 s1, s33, 64 -; GFX940-NEXT: s_mov_b32 s59, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s59 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s59, v0, 0 -; GFX940-NEXT: s_mov_b32 s32, s33 -; GFX940-NEXT: s_xor_saveexec_b64 s[2:3], -1 -; GFX940-NEXT: s_add_i32 s1, s33, 0x4040 -; GFX940-NEXT: scratch_load_dword v0, off, s1 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b32 s33, s0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s0, s33 +; GFX942-NEXT: s_mov_b32 s33, s32 +; GFX942-NEXT: s_addk_i32 s32, 0x4040 +; GFX942-NEXT: s_add_i32 s1, s33, 64 +; GFX942-NEXT: s_mov_b32 s59, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s59 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s32, s33 +; GFX942-NEXT: s_mov_b32 s33, s0 +; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca0) ret void @@ -1420,12 +866,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: ; GFX10_1: ; %bb.0: ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5 ; GFX10_1-NEXT: s_add_i32 s59, s4, 
0x442c @@ -1437,23 +877,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: ; GFX10_3: ; %bb.0: ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5 ; GFX10_3-NEXT: s_add_i32 s59, s4, 0x442c @@ -1465,22 +893,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v1, s59, 0 ; 
GFX11-NEXT: s_add_i32 s0, s32, 64 ; GFX11-NEXT: s_add_i32 s59, s32, 0x442c ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -1491,12 +908,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v1, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: @@ -1506,11 +917,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:32768 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_add_co_i32 s59, s32, 0x43ec ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo @@ -1520,23 +926,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v1, 0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:32768 ; 4-byte Folded Reload -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX8-NEXT: 
buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v1, s59, 0 ; GFX8-NEXT: s_lshr_b32 s4, s32, 6 ; GFX8-NEXT: s_add_i32 s59, s4, 0x442c ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 @@ -1548,22 +943,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v1, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v1, s59, 0 ; GFX900-NEXT: s_lshr_b32 s4, s32, 6 ; GFX900-NEXT: s_add_i32 s59, s4, 0x442c ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 @@ -1575,39 +959,22 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v1, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; 
GFX940-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX940-NEXT: scratch_store_dword off, v1, s2 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v1, s59, 0 -; GFX940-NEXT: s_add_i32 s59, s32, 0x442c -; GFX940-NEXT: s_add_i32 s0, s32, 64 -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use alloca0 v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_and_b64 s[0:1], 0, exec -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s59, scc -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s59, v1, 0 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX940-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_add_i32 s59, s32, 0x442c +; GFX942-NEXT: s_add_i32 s0, s32, 64 +; GFX942-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use alloca0 v0 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_and_b64 s[0:1], 0, exec +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s59, scc +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca [4096 x i32], align 4, addrspace(5) %alloca1.offset = getelementptr [4096 x i32], ptr addrspace(5) %alloca1, i32 0, i32 251 @@ -1620,12 +987,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: ; GFX10_1: ; %bb.0: ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded 
Spill -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_lshl_b32 s4, s16, 2 ; GFX10_1-NEXT: s_lshr_b32 s59, s32, 5 @@ -1639,23 +1000,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: ; GFX10_3: ; %bb.0: ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_lshl_b32 s4, s16, 2 ; GFX10_3-NEXT: s_lshr_b32 s59, s32, 5 @@ -1669,22 +1018,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0 -; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload -; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX11-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v1, s59, 0 ; GFX11-NEXT: s_add_i32 s1, s32, 64 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, s1 @@ -1697,12 +1035,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v1, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: @@ -1712,11 +1044,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:32768 ; 4-byte Folded Spill -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1730,23 +1057,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v1, 0 -; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:32768 ; 4-byte Folded Reload -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v1, s59, 0 ; GFX8-NEXT: s_lshl_b32 s4, s16, 2 ; GFX8-NEXT: s_lshr_b32 s59, s32, 6 ; GFX8-NEXT: s_add_i32 s59, s59, s4 @@ -1760,22 +1076,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v1, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v1, s59, 0 ; GFX900-NEXT: s_lshl_b32 s4, s16, 2 ; GFX900-NEXT: s_lshr_b32 s59, s32, 6 ; GFX900-NEXT: s_add_i32 s59, s59, s4 @@ -1789,41 +1094,24 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v1, 0 -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 
s[30:31] ; -; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[2:3], -1 -; GFX940-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX940-NEXT: scratch_store_dword off, v1, s1 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: v_writelane_b32 v1, s59, 0 -; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: s_add_i32 s59, s32, s0 -; GFX940-NEXT: s_addk_i32 s59, 0x4040 -; GFX940-NEXT: s_add_i32 s0, s32, 64 -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use alloca0 v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_and_b64 s[0:1], 0, exec -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s59, scc -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s59, v1, 0 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX940-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s59, s32, s0 +; GFX942-NEXT: s_addk_i32 s59, 0x4040 +; GFX942-NEXT: s_add_i32 s0, s32, 64 +; GFX942-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use alloca0 v0 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_and_b64 s[0:1], 0, exec +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s59, scc +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca [4096 x i32], align 4, addrspace(5) %alloca1.offset = getelementptr [4096 x i32], ptr addrspace(5) %alloca1, i32 0, i32 %soffset diff --git 
a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll index 2a8528148dd94..cc33efca157b9 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX900 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10_1 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10_3 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s @@ -44,28 +44,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX7-NEXT: v_writelane_b32 v23, s37, 4 ; GFX7-NEXT: v_writelane_b32 v23, s38, 5 ; GFX7-NEXT: v_writelane_b32 v23, s39, 6 -; GFX7-NEXT: v_writelane_b32 v23, s40, 7 -; GFX7-NEXT: v_writelane_b32 v23, s41, 8 -; GFX7-NEXT: v_writelane_b32 v23, s42, 9 -; GFX7-NEXT: v_writelane_b32 v23, s43, 10 -; GFX7-NEXT: v_writelane_b32 v23, s44, 11 -; GFX7-NEXT: v_writelane_b32 v23, s45, 12 -; GFX7-NEXT: v_writelane_b32 v23, s46, 13 -; GFX7-NEXT: v_writelane_b32 v23, s47, 14 -; GFX7-NEXT: v_writelane_b32 v23, s48, 15 -; GFX7-NEXT: v_writelane_b32 v23, s49, 16 -; GFX7-NEXT: v_writelane_b32 v23, s50, 17 -; GFX7-NEXT: v_writelane_b32 v23, s51, 18 -; GFX7-NEXT: v_writelane_b32 v23, s52, 19 -; GFX7-NEXT: v_writelane_b32 v23, s53, 20 -; GFX7-NEXT: v_writelane_b32 v23, s54, 21 -; GFX7-NEXT: v_writelane_b32 
v23, s55, 22 -; GFX7-NEXT: v_writelane_b32 v23, s56, 23 -; GFX7-NEXT: v_writelane_b32 v23, s57, 24 -; GFX7-NEXT: v_writelane_b32 v23, s58, 25 -; GFX7-NEXT: v_writelane_b32 v23, s59, 26 -; GFX7-NEXT: v_writelane_b32 v23, s30, 27 -; GFX7-NEXT: v_writelane_b32 v23, s31, 28 +; GFX7-NEXT: v_writelane_b32 v23, s48, 7 +; GFX7-NEXT: v_writelane_b32 v23, s49, 8 +; GFX7-NEXT: v_writelane_b32 v23, s50, 9 +; GFX7-NEXT: v_writelane_b32 v23, s51, 10 +; GFX7-NEXT: v_writelane_b32 v23, s52, 11 +; GFX7-NEXT: v_writelane_b32 v23, s53, 12 +; GFX7-NEXT: v_writelane_b32 v23, s54, 13 +; GFX7-NEXT: v_writelane_b32 v23, s55, 14 +; GFX7-NEXT: v_writelane_b32 v23, s30, 15 +; GFX7-NEXT: v_writelane_b32 v23, s31, 16 ; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0 ; GFX7-NEXT: s_and_b64 s[4:5], 0, exec @@ -85,28 +73,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX7-NEXT: ;;#ASMEND -; GFX7-NEXT: v_readlane_b32 s30, v23, 27 -; GFX7-NEXT: v_readlane_b32 s31, v23, 28 -; GFX7-NEXT: v_readlane_b32 s59, v23, 26 -; GFX7-NEXT: v_readlane_b32 s58, v23, 25 -; GFX7-NEXT: v_readlane_b32 s57, v23, 24 -; GFX7-NEXT: v_readlane_b32 s56, v23, 23 -; GFX7-NEXT: v_readlane_b32 s55, v23, 22 -; GFX7-NEXT: v_readlane_b32 s54, v23, 21 -; GFX7-NEXT: v_readlane_b32 s53, v23, 20 -; GFX7-NEXT: v_readlane_b32 s52, v23, 19 -; GFX7-NEXT: v_readlane_b32 s51, v23, 18 -; GFX7-NEXT: v_readlane_b32 s50, v23, 17 -; GFX7-NEXT: v_readlane_b32 s49, v23, 16 -; GFX7-NEXT: v_readlane_b32 s48, v23, 15 -; GFX7-NEXT: v_readlane_b32 s47, v23, 14 -; GFX7-NEXT: v_readlane_b32 s46, v23, 13 -; GFX7-NEXT: v_readlane_b32 s45, v23, 12 -; GFX7-NEXT: v_readlane_b32 s44, v23, 11 -; GFX7-NEXT: v_readlane_b32 s43, v23, 10 -; GFX7-NEXT: v_readlane_b32 s42, v23, 9 -; GFX7-NEXT: v_readlane_b32 s41, v23, 8 -; GFX7-NEXT: v_readlane_b32 s40, v23, 7 +; 
GFX7-NEXT: v_readlane_b32 s30, v23, 15 +; GFX7-NEXT: v_readlane_b32 s31, v23, 16 +; GFX7-NEXT: v_readlane_b32 s55, v23, 14 +; GFX7-NEXT: v_readlane_b32 s54, v23, 13 +; GFX7-NEXT: v_readlane_b32 s53, v23, 12 +; GFX7-NEXT: v_readlane_b32 s52, v23, 11 +; GFX7-NEXT: v_readlane_b32 s51, v23, 10 +; GFX7-NEXT: v_readlane_b32 s50, v23, 9 +; GFX7-NEXT: v_readlane_b32 s49, v23, 8 +; GFX7-NEXT: v_readlane_b32 s48, v23, 7 ; GFX7-NEXT: v_readlane_b32 s39, v23, 6 ; GFX7-NEXT: v_readlane_b32 s38, v23, 5 ; GFX7-NEXT: v_readlane_b32 s37, v23, 4 @@ -135,28 +111,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX8-NEXT: v_writelane_b32 v23, s37, 4 ; GFX8-NEXT: v_writelane_b32 v23, s38, 5 ; GFX8-NEXT: v_writelane_b32 v23, s39, 6 -; GFX8-NEXT: v_writelane_b32 v23, s40, 7 -; GFX8-NEXT: v_writelane_b32 v23, s41, 8 -; GFX8-NEXT: v_writelane_b32 v23, s42, 9 -; GFX8-NEXT: v_writelane_b32 v23, s43, 10 -; GFX8-NEXT: v_writelane_b32 v23, s44, 11 -; GFX8-NEXT: v_writelane_b32 v23, s45, 12 -; GFX8-NEXT: v_writelane_b32 v23, s46, 13 -; GFX8-NEXT: v_writelane_b32 v23, s47, 14 -; GFX8-NEXT: v_writelane_b32 v23, s48, 15 -; GFX8-NEXT: v_writelane_b32 v23, s49, 16 -; GFX8-NEXT: v_writelane_b32 v23, s50, 17 -; GFX8-NEXT: v_writelane_b32 v23, s51, 18 -; GFX8-NEXT: v_writelane_b32 v23, s52, 19 -; GFX8-NEXT: v_writelane_b32 v23, s53, 20 -; GFX8-NEXT: v_writelane_b32 v23, s54, 21 -; GFX8-NEXT: v_writelane_b32 v23, s55, 22 -; GFX8-NEXT: v_writelane_b32 v23, s56, 23 -; GFX8-NEXT: v_writelane_b32 v23, s57, 24 -; GFX8-NEXT: v_writelane_b32 v23, s58, 25 -; GFX8-NEXT: v_writelane_b32 v23, s59, 26 -; GFX8-NEXT: v_writelane_b32 v23, s30, 27 -; GFX8-NEXT: v_writelane_b32 v23, s31, 28 +; GFX8-NEXT: v_writelane_b32 v23, s48, 7 +; GFX8-NEXT: v_writelane_b32 v23, s49, 8 +; GFX8-NEXT: v_writelane_b32 v23, s50, 9 +; GFX8-NEXT: v_writelane_b32 v23, s51, 10 +; GFX8-NEXT: v_writelane_b32 v23, s52, 11 +; GFX8-NEXT: v_writelane_b32 v23, s53, 12 +; GFX8-NEXT: v_writelane_b32 v23, s54, 13 
+; GFX8-NEXT: v_writelane_b32 v23, s55, 14 +; GFX8-NEXT: v_writelane_b32 v23, s30, 15 +; GFX8-NEXT: v_writelane_b32 v23, s31, 16 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec @@ -176,28 +140,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s30, v23, 27 -; GFX8-NEXT: v_readlane_b32 s31, v23, 28 -; GFX8-NEXT: v_readlane_b32 s59, v23, 26 -; GFX8-NEXT: v_readlane_b32 s58, v23, 25 -; GFX8-NEXT: v_readlane_b32 s57, v23, 24 -; GFX8-NEXT: v_readlane_b32 s56, v23, 23 -; GFX8-NEXT: v_readlane_b32 s55, v23, 22 -; GFX8-NEXT: v_readlane_b32 s54, v23, 21 -; GFX8-NEXT: v_readlane_b32 s53, v23, 20 -; GFX8-NEXT: v_readlane_b32 s52, v23, 19 -; GFX8-NEXT: v_readlane_b32 s51, v23, 18 -; GFX8-NEXT: v_readlane_b32 s50, v23, 17 -; GFX8-NEXT: v_readlane_b32 s49, v23, 16 -; GFX8-NEXT: v_readlane_b32 s48, v23, 15 -; GFX8-NEXT: v_readlane_b32 s47, v23, 14 -; GFX8-NEXT: v_readlane_b32 s46, v23, 13 -; GFX8-NEXT: v_readlane_b32 s45, v23, 12 -; GFX8-NEXT: v_readlane_b32 s44, v23, 11 -; GFX8-NEXT: v_readlane_b32 s43, v23, 10 -; GFX8-NEXT: v_readlane_b32 s42, v23, 9 -; GFX8-NEXT: v_readlane_b32 s41, v23, 8 -; GFX8-NEXT: v_readlane_b32 s40, v23, 7 +; GFX8-NEXT: v_readlane_b32 s30, v23, 15 +; GFX8-NEXT: v_readlane_b32 s31, v23, 16 +; GFX8-NEXT: v_readlane_b32 s55, v23, 14 +; GFX8-NEXT: v_readlane_b32 s54, v23, 13 +; GFX8-NEXT: v_readlane_b32 s53, v23, 12 +; GFX8-NEXT: v_readlane_b32 s52, v23, 11 +; GFX8-NEXT: v_readlane_b32 s51, v23, 10 +; GFX8-NEXT: v_readlane_b32 s50, v23, 9 +; GFX8-NEXT: v_readlane_b32 s49, v23, 8 +; GFX8-NEXT: v_readlane_b32 s48, v23, 7 ; GFX8-NEXT: v_readlane_b32 s39, v23, 6 ; GFX8-NEXT: v_readlane_b32 s38, v23, 5 ; GFX8-NEXT: v_readlane_b32 s37, v23, 4 @@ -226,28 +178,16 @@ define 
void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX900-NEXT: v_writelane_b32 v23, s37, 4 ; GFX900-NEXT: v_writelane_b32 v23, s38, 5 ; GFX900-NEXT: v_writelane_b32 v23, s39, 6 -; GFX900-NEXT: v_writelane_b32 v23, s40, 7 -; GFX900-NEXT: v_writelane_b32 v23, s41, 8 -; GFX900-NEXT: v_writelane_b32 v23, s42, 9 -; GFX900-NEXT: v_writelane_b32 v23, s43, 10 -; GFX900-NEXT: v_writelane_b32 v23, s44, 11 -; GFX900-NEXT: v_writelane_b32 v23, s45, 12 -; GFX900-NEXT: v_writelane_b32 v23, s46, 13 -; GFX900-NEXT: v_writelane_b32 v23, s47, 14 -; GFX900-NEXT: v_writelane_b32 v23, s48, 15 -; GFX900-NEXT: v_writelane_b32 v23, s49, 16 -; GFX900-NEXT: v_writelane_b32 v23, s50, 17 -; GFX900-NEXT: v_writelane_b32 v23, s51, 18 -; GFX900-NEXT: v_writelane_b32 v23, s52, 19 -; GFX900-NEXT: v_writelane_b32 v23, s53, 20 -; GFX900-NEXT: v_writelane_b32 v23, s54, 21 -; GFX900-NEXT: v_writelane_b32 v23, s55, 22 -; GFX900-NEXT: v_writelane_b32 v23, s56, 23 -; GFX900-NEXT: v_writelane_b32 v23, s57, 24 -; GFX900-NEXT: v_writelane_b32 v23, s58, 25 -; GFX900-NEXT: v_writelane_b32 v23, s59, 26 -; GFX900-NEXT: v_writelane_b32 v23, s30, 27 -; GFX900-NEXT: v_writelane_b32 v23, s31, 28 +; GFX900-NEXT: v_writelane_b32 v23, s48, 7 +; GFX900-NEXT: v_writelane_b32 v23, s49, 8 +; GFX900-NEXT: v_writelane_b32 v23, s50, 9 +; GFX900-NEXT: v_writelane_b32 v23, s51, 10 +; GFX900-NEXT: v_writelane_b32 v23, s52, 11 +; GFX900-NEXT: v_writelane_b32 v23, s53, 12 +; GFX900-NEXT: v_writelane_b32 v23, s54, 13 +; GFX900-NEXT: v_writelane_b32 v23, s55, 14 +; GFX900-NEXT: v_writelane_b32 v23, s30, 15 +; GFX900-NEXT: v_writelane_b32 v23, s31, 16 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec @@ -266,28 +206,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; 
GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s30, v23, 27 -; GFX900-NEXT: v_readlane_b32 s31, v23, 28 -; GFX900-NEXT: v_readlane_b32 s59, v23, 26 -; GFX900-NEXT: v_readlane_b32 s58, v23, 25 -; GFX900-NEXT: v_readlane_b32 s57, v23, 24 -; GFX900-NEXT: v_readlane_b32 s56, v23, 23 -; GFX900-NEXT: v_readlane_b32 s55, v23, 22 -; GFX900-NEXT: v_readlane_b32 s54, v23, 21 -; GFX900-NEXT: v_readlane_b32 s53, v23, 20 -; GFX900-NEXT: v_readlane_b32 s52, v23, 19 -; GFX900-NEXT: v_readlane_b32 s51, v23, 18 -; GFX900-NEXT: v_readlane_b32 s50, v23, 17 -; GFX900-NEXT: v_readlane_b32 s49, v23, 16 -; GFX900-NEXT: v_readlane_b32 s48, v23, 15 -; GFX900-NEXT: v_readlane_b32 s47, v23, 14 -; GFX900-NEXT: v_readlane_b32 s46, v23, 13 -; GFX900-NEXT: v_readlane_b32 s45, v23, 12 -; GFX900-NEXT: v_readlane_b32 s44, v23, 11 -; GFX900-NEXT: v_readlane_b32 s43, v23, 10 -; GFX900-NEXT: v_readlane_b32 s42, v23, 9 -; GFX900-NEXT: v_readlane_b32 s41, v23, 8 -; GFX900-NEXT: v_readlane_b32 s40, v23, 7 +; GFX900-NEXT: v_readlane_b32 s30, v23, 15 +; GFX900-NEXT: v_readlane_b32 s31, v23, 16 +; GFX900-NEXT: v_readlane_b32 s55, v23, 14 +; GFX900-NEXT: v_readlane_b32 s54, v23, 13 +; GFX900-NEXT: v_readlane_b32 s53, v23, 12 +; GFX900-NEXT: v_readlane_b32 s52, v23, 11 +; GFX900-NEXT: v_readlane_b32 s51, v23, 10 +; GFX900-NEXT: v_readlane_b32 s50, v23, 9 +; GFX900-NEXT: v_readlane_b32 s49, v23, 8 +; GFX900-NEXT: v_readlane_b32 s48, v23, 7 ; GFX900-NEXT: v_readlane_b32 s39, v23, 6 ; GFX900-NEXT: v_readlane_b32 s38, v23, 5 ; GFX900-NEXT: v_readlane_b32 s37, v23, 4 @@ -302,98 +230,70 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4044 -; GFX940-NEXT: 
scratch_store_dword off, v23, s2 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v23, s33, 0 -; GFX940-NEXT: v_writelane_b32 v23, s34, 1 -; GFX940-NEXT: v_writelane_b32 v23, s35, 2 -; GFX940-NEXT: v_writelane_b32 v23, s36, 3 -; GFX940-NEXT: v_writelane_b32 v23, s37, 4 -; GFX940-NEXT: v_writelane_b32 v23, s38, 5 -; GFX940-NEXT: v_writelane_b32 v23, s39, 6 -; GFX940-NEXT: v_writelane_b32 v23, s40, 7 -; GFX940-NEXT: v_writelane_b32 v23, s41, 8 -; GFX940-NEXT: v_writelane_b32 v23, s42, 9 -; GFX940-NEXT: v_writelane_b32 v23, s43, 10 -; GFX940-NEXT: v_writelane_b32 v23, s44, 11 -; GFX940-NEXT: v_writelane_b32 v23, s45, 12 -; GFX940-NEXT: v_writelane_b32 v23, s46, 13 -; GFX940-NEXT: v_writelane_b32 v23, s47, 14 -; GFX940-NEXT: v_writelane_b32 v23, s48, 15 -; GFX940-NEXT: v_writelane_b32 v23, s49, 16 -; GFX940-NEXT: v_writelane_b32 v23, s50, 17 -; GFX940-NEXT: v_writelane_b32 v23, s51, 18 -; GFX940-NEXT: v_writelane_b32 v23, s52, 19 -; GFX940-NEXT: v_writelane_b32 v23, s53, 20 -; GFX940-NEXT: v_writelane_b32 v23, s54, 21 -; GFX940-NEXT: v_writelane_b32 v23, s55, 22 -; GFX940-NEXT: v_writelane_b32 v23, s56, 23 -; GFX940-NEXT: v_writelane_b32 v23, s57, 24 -; GFX940-NEXT: v_writelane_b32 v23, s58, 25 -; GFX940-NEXT: v_writelane_b32 v23, s59, 26 -; GFX940-NEXT: v_writelane_b32 v23, s60, 27 -; GFX940-NEXT: v_writelane_b32 v23, s61, 28 -; GFX940-NEXT: v_writelane_b32 v23, s30, 29 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v23, s31, 30 -; GFX940-NEXT: s_add_i32 s0, s32, 64 -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: s_and_b64 s[60:61], 0, exec -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use alloca0 v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_addc_u32 s60, s32, 0x4040 -; GFX940-NEXT: s_bitcmp1_b32 s60, 0 -; GFX940-NEXT: s_bitset0_b32 s60, 0 -; 
GFX940-NEXT: s_mov_b32 s59, s60 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s30, v23, 29 -; GFX940-NEXT: v_readlane_b32 s31, v23, 30 -; GFX940-NEXT: v_readlane_b32 s61, v23, 28 -; GFX940-NEXT: v_readlane_b32 s60, v23, 27 -; GFX940-NEXT: v_readlane_b32 s59, v23, 26 -; GFX940-NEXT: v_readlane_b32 s58, v23, 25 -; GFX940-NEXT: v_readlane_b32 s57, v23, 24 -; GFX940-NEXT: v_readlane_b32 s56, v23, 23 -; GFX940-NEXT: v_readlane_b32 s55, v23, 22 -; GFX940-NEXT: v_readlane_b32 s54, v23, 21 -; GFX940-NEXT: v_readlane_b32 s53, v23, 20 -; GFX940-NEXT: v_readlane_b32 s52, v23, 19 -; GFX940-NEXT: v_readlane_b32 s51, v23, 18 -; GFX940-NEXT: v_readlane_b32 s50, v23, 17 -; GFX940-NEXT: v_readlane_b32 s49, v23, 16 -; GFX940-NEXT: v_readlane_b32 s48, v23, 15 -; GFX940-NEXT: v_readlane_b32 s47, v23, 14 -; GFX940-NEXT: v_readlane_b32 s46, v23, 13 -; GFX940-NEXT: v_readlane_b32 s45, v23, 12 -; GFX940-NEXT: v_readlane_b32 s44, v23, 11 -; GFX940-NEXT: v_readlane_b32 s43, v23, 10 -; GFX940-NEXT: v_readlane_b32 s42, v23, 9 -; GFX940-NEXT: v_readlane_b32 s41, v23, 8 -; GFX940-NEXT: v_readlane_b32 s40, v23, 7 -; GFX940-NEXT: v_readlane_b32 s39, v23, 6 -; GFX940-NEXT: v_readlane_b32 s38, v23, 5 -; GFX940-NEXT: v_readlane_b32 s37, v23, 4 -; GFX940-NEXT: v_readlane_b32 s36, v23, 3 -; GFX940-NEXT: v_readlane_b32 s35, v23, 2 -; GFX940-NEXT: v_readlane_b32 s34, v23, 1 -; GFX940-NEXT: v_readlane_b32 s33, v23, 0 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4044 -; GFX940-NEXT: scratch_load_dword v23, off, s2 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: 
s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: s_add_i32 s2, s32, 0x4044 +; GFX942-NEXT: scratch_store_dword off, v23, s2 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v23, s33, 0 +; GFX942-NEXT: v_writelane_b32 v23, s34, 1 +; GFX942-NEXT: v_writelane_b32 v23, s35, 2 +; GFX942-NEXT: v_writelane_b32 v23, s36, 3 +; GFX942-NEXT: v_writelane_b32 v23, s37, 4 +; GFX942-NEXT: v_writelane_b32 v23, s38, 5 +; GFX942-NEXT: v_writelane_b32 v23, s39, 6 +; GFX942-NEXT: v_writelane_b32 v23, s48, 7 +; GFX942-NEXT: v_writelane_b32 v23, s49, 8 +; GFX942-NEXT: v_writelane_b32 v23, s50, 9 +; GFX942-NEXT: v_writelane_b32 v23, s51, 10 +; GFX942-NEXT: v_writelane_b32 v23, s52, 11 +; GFX942-NEXT: v_writelane_b32 v23, s53, 12 +; GFX942-NEXT: v_writelane_b32 v23, s54, 13 +; GFX942-NEXT: v_writelane_b32 v23, s55, 14 +; GFX942-NEXT: v_writelane_b32 v23, s30, 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v23, s31, 16 +; GFX942-NEXT: s_add_i32 s0, s32, 64 +; GFX942-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NEXT: s_and_b64 s[60:61], 0, exec +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use alloca0 v0 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_addc_u32 s60, s32, 0x4040 +; GFX942-NEXT: s_bitcmp1_b32 s60, 0 +; GFX942-NEXT: s_bitset0_b32 s60, 0 +; GFX942-NEXT: s_mov_b32 s59, s60 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s30, v23, 15 +; GFX942-NEXT: v_readlane_b32 s31, v23, 16 +; GFX942-NEXT: v_readlane_b32 s55, v23, 14 +; GFX942-NEXT: v_readlane_b32 s54, v23, 13 +; GFX942-NEXT: v_readlane_b32 s53, v23, 12 +; GFX942-NEXT: v_readlane_b32 s52, v23, 11 +; GFX942-NEXT: v_readlane_b32 s51, v23, 10 +; GFX942-NEXT: v_readlane_b32 s50, v23, 9 +; 
GFX942-NEXT: v_readlane_b32 s49, v23, 8 +; GFX942-NEXT: v_readlane_b32 s48, v23, 7 +; GFX942-NEXT: v_readlane_b32 s39, v23, 6 +; GFX942-NEXT: v_readlane_b32 s38, v23, 5 +; GFX942-NEXT: v_readlane_b32 s37, v23, 4 +; GFX942-NEXT: v_readlane_b32 s36, v23, 3 +; GFX942-NEXT: v_readlane_b32 s35, v23, 2 +; GFX942-NEXT: v_readlane_b32 s34, v23, 1 +; GFX942-NEXT: v_readlane_b32 s33, v23, 0 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: s_add_i32 s2, s32, 0x4044 +; GFX942-NEXT: scratch_load_dword v23, off, s2 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs: ; GFX10_1: ; %bb.0: @@ -410,28 +310,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX10_1-NEXT: v_writelane_b32 v23, s37, 4 ; GFX10_1-NEXT: v_writelane_b32 v23, s38, 5 ; GFX10_1-NEXT: v_writelane_b32 v23, s39, 6 -; GFX10_1-NEXT: v_writelane_b32 v23, s40, 7 -; GFX10_1-NEXT: v_writelane_b32 v23, s41, 8 -; GFX10_1-NEXT: v_writelane_b32 v23, s42, 9 -; GFX10_1-NEXT: v_writelane_b32 v23, s43, 10 -; GFX10_1-NEXT: v_writelane_b32 v23, s44, 11 -; GFX10_1-NEXT: v_writelane_b32 v23, s45, 12 -; GFX10_1-NEXT: v_writelane_b32 v23, s46, 13 -; GFX10_1-NEXT: v_writelane_b32 v23, s47, 14 -; GFX10_1-NEXT: v_writelane_b32 v23, s48, 15 -; GFX10_1-NEXT: v_writelane_b32 v23, s49, 16 -; GFX10_1-NEXT: v_writelane_b32 v23, s50, 17 -; GFX10_1-NEXT: v_writelane_b32 v23, s51, 18 -; GFX10_1-NEXT: v_writelane_b32 v23, s52, 19 -; GFX10_1-NEXT: v_writelane_b32 v23, s53, 20 -; GFX10_1-NEXT: v_writelane_b32 v23, s54, 21 -; GFX10_1-NEXT: v_writelane_b32 v23, s55, 22 -; GFX10_1-NEXT: v_writelane_b32 v23, s56, 23 -; GFX10_1-NEXT: v_writelane_b32 v23, s57, 24 -; GFX10_1-NEXT: v_writelane_b32 v23, s58, 25 -; GFX10_1-NEXT: v_writelane_b32 v23, s59, 26 -; GFX10_1-NEXT: v_writelane_b32 v23, s30, 27 -; GFX10_1-NEXT: v_writelane_b32 v23, s31, 28 
+; GFX10_1-NEXT: v_writelane_b32 v23, s48, 7 +; GFX10_1-NEXT: v_writelane_b32 v23, s49, 8 +; GFX10_1-NEXT: v_writelane_b32 v23, s50, 9 +; GFX10_1-NEXT: v_writelane_b32 v23, s51, 10 +; GFX10_1-NEXT: v_writelane_b32 v23, s52, 11 +; GFX10_1-NEXT: v_writelane_b32 v23, s53, 12 +; GFX10_1-NEXT: v_writelane_b32 v23, s54, 13 +; GFX10_1-NEXT: v_writelane_b32 v23, s55, 14 +; GFX10_1-NEXT: v_writelane_b32 v23, s30, 15 +; GFX10_1-NEXT: v_writelane_b32 v23, s31, 16 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 @@ -447,28 +335,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s30, v23, 27 -; GFX10_1-NEXT: v_readlane_b32 s31, v23, 28 -; GFX10_1-NEXT: v_readlane_b32 s59, v23, 26 -; GFX10_1-NEXT: v_readlane_b32 s58, v23, 25 -; GFX10_1-NEXT: v_readlane_b32 s57, v23, 24 -; GFX10_1-NEXT: v_readlane_b32 s56, v23, 23 -; GFX10_1-NEXT: v_readlane_b32 s55, v23, 22 -; GFX10_1-NEXT: v_readlane_b32 s54, v23, 21 -; GFX10_1-NEXT: v_readlane_b32 s53, v23, 20 -; GFX10_1-NEXT: v_readlane_b32 s52, v23, 19 -; GFX10_1-NEXT: v_readlane_b32 s51, v23, 18 -; GFX10_1-NEXT: v_readlane_b32 s50, v23, 17 -; GFX10_1-NEXT: v_readlane_b32 s49, v23, 16 -; GFX10_1-NEXT: v_readlane_b32 s48, v23, 15 -; GFX10_1-NEXT: v_readlane_b32 s47, v23, 14 -; GFX10_1-NEXT: v_readlane_b32 s46, v23, 13 -; GFX10_1-NEXT: v_readlane_b32 s45, v23, 12 -; GFX10_1-NEXT: v_readlane_b32 s44, v23, 11 -; GFX10_1-NEXT: v_readlane_b32 s43, v23, 10 -; GFX10_1-NEXT: v_readlane_b32 s42, v23, 9 -; GFX10_1-NEXT: v_readlane_b32 s41, v23, 8 -; GFX10_1-NEXT: v_readlane_b32 s40, v23, 7 +; GFX10_1-NEXT: v_readlane_b32 s30, v23, 15 +; GFX10_1-NEXT: v_readlane_b32 s31, v23, 16 +; GFX10_1-NEXT: v_readlane_b32 s55, v23, 14 +; GFX10_1-NEXT: 
v_readlane_b32 s54, v23, 13 +; GFX10_1-NEXT: v_readlane_b32 s53, v23, 12 +; GFX10_1-NEXT: v_readlane_b32 s52, v23, 11 +; GFX10_1-NEXT: v_readlane_b32 s51, v23, 10 +; GFX10_1-NEXT: v_readlane_b32 s50, v23, 9 +; GFX10_1-NEXT: v_readlane_b32 s49, v23, 8 +; GFX10_1-NEXT: v_readlane_b32 s48, v23, 7 ; GFX10_1-NEXT: v_readlane_b32 s39, v23, 6 ; GFX10_1-NEXT: v_readlane_b32 s38, v23, 5 ; GFX10_1-NEXT: v_readlane_b32 s37, v23, 4 @@ -498,28 +374,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX10_3-NEXT: v_writelane_b32 v23, s37, 4 ; GFX10_3-NEXT: v_writelane_b32 v23, s38, 5 ; GFX10_3-NEXT: v_writelane_b32 v23, s39, 6 -; GFX10_3-NEXT: v_writelane_b32 v23, s40, 7 -; GFX10_3-NEXT: v_writelane_b32 v23, s41, 8 -; GFX10_3-NEXT: v_writelane_b32 v23, s42, 9 -; GFX10_3-NEXT: v_writelane_b32 v23, s43, 10 -; GFX10_3-NEXT: v_writelane_b32 v23, s44, 11 -; GFX10_3-NEXT: v_writelane_b32 v23, s45, 12 -; GFX10_3-NEXT: v_writelane_b32 v23, s46, 13 -; GFX10_3-NEXT: v_writelane_b32 v23, s47, 14 -; GFX10_3-NEXT: v_writelane_b32 v23, s48, 15 -; GFX10_3-NEXT: v_writelane_b32 v23, s49, 16 -; GFX10_3-NEXT: v_writelane_b32 v23, s50, 17 -; GFX10_3-NEXT: v_writelane_b32 v23, s51, 18 -; GFX10_3-NEXT: v_writelane_b32 v23, s52, 19 -; GFX10_3-NEXT: v_writelane_b32 v23, s53, 20 -; GFX10_3-NEXT: v_writelane_b32 v23, s54, 21 -; GFX10_3-NEXT: v_writelane_b32 v23, s55, 22 -; GFX10_3-NEXT: v_writelane_b32 v23, s56, 23 -; GFX10_3-NEXT: v_writelane_b32 v23, s57, 24 -; GFX10_3-NEXT: v_writelane_b32 v23, s58, 25 -; GFX10_3-NEXT: v_writelane_b32 v23, s59, 26 -; GFX10_3-NEXT: v_writelane_b32 v23, s30, 27 -; GFX10_3-NEXT: v_writelane_b32 v23, s31, 28 +; GFX10_3-NEXT: v_writelane_b32 v23, s48, 7 +; GFX10_3-NEXT: v_writelane_b32 v23, s49, 8 +; GFX10_3-NEXT: v_writelane_b32 v23, s50, 9 +; GFX10_3-NEXT: v_writelane_b32 v23, s51, 10 +; GFX10_3-NEXT: v_writelane_b32 v23, s52, 11 +; GFX10_3-NEXT: v_writelane_b32 v23, s53, 12 +; GFX10_3-NEXT: v_writelane_b32 v23, s54, 13 +; 
GFX10_3-NEXT: v_writelane_b32 v23, s55, 14 +; GFX10_3-NEXT: v_writelane_b32 v23, s30, 15 +; GFX10_3-NEXT: v_writelane_b32 v23, s31, 16 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 @@ -535,28 +399,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s30, v23, 27 -; GFX10_3-NEXT: v_readlane_b32 s31, v23, 28 -; GFX10_3-NEXT: v_readlane_b32 s59, v23, 26 -; GFX10_3-NEXT: v_readlane_b32 s58, v23, 25 -; GFX10_3-NEXT: v_readlane_b32 s57, v23, 24 -; GFX10_3-NEXT: v_readlane_b32 s56, v23, 23 -; GFX10_3-NEXT: v_readlane_b32 s55, v23, 22 -; GFX10_3-NEXT: v_readlane_b32 s54, v23, 21 -; GFX10_3-NEXT: v_readlane_b32 s53, v23, 20 -; GFX10_3-NEXT: v_readlane_b32 s52, v23, 19 -; GFX10_3-NEXT: v_readlane_b32 s51, v23, 18 -; GFX10_3-NEXT: v_readlane_b32 s50, v23, 17 -; GFX10_3-NEXT: v_readlane_b32 s49, v23, 16 -; GFX10_3-NEXT: v_readlane_b32 s48, v23, 15 -; GFX10_3-NEXT: v_readlane_b32 s47, v23, 14 -; GFX10_3-NEXT: v_readlane_b32 s46, v23, 13 -; GFX10_3-NEXT: v_readlane_b32 s45, v23, 12 -; GFX10_3-NEXT: v_readlane_b32 s44, v23, 11 -; GFX10_3-NEXT: v_readlane_b32 s43, v23, 10 -; GFX10_3-NEXT: v_readlane_b32 s42, v23, 9 -; GFX10_3-NEXT: v_readlane_b32 s41, v23, 8 -; GFX10_3-NEXT: v_readlane_b32 s40, v23, 7 +; GFX10_3-NEXT: v_readlane_b32 s30, v23, 15 +; GFX10_3-NEXT: v_readlane_b32 s31, v23, 16 +; GFX10_3-NEXT: v_readlane_b32 s55, v23, 14 +; GFX10_3-NEXT: v_readlane_b32 s54, v23, 13 +; GFX10_3-NEXT: v_readlane_b32 s53, v23, 12 +; GFX10_3-NEXT: v_readlane_b32 s52, v23, 11 +; GFX10_3-NEXT: v_readlane_b32 s51, v23, 10 +; GFX10_3-NEXT: v_readlane_b32 s50, v23, 9 +; GFX10_3-NEXT: v_readlane_b32 s49, v23, 8 +; GFX10_3-NEXT: v_readlane_b32 s48, v23, 7 ; GFX10_3-NEXT: v_readlane_b32 s39, 
v23, 6 ; GFX10_3-NEXT: v_readlane_b32 s38, v23, 5 ; GFX10_3-NEXT: v_readlane_b32 s37, v23, 4 @@ -585,28 +437,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX11-NEXT: v_writelane_b32 v23, s37, 4 ; GFX11-NEXT: v_writelane_b32 v23, s38, 5 ; GFX11-NEXT: v_writelane_b32 v23, s39, 6 -; GFX11-NEXT: v_writelane_b32 v23, s40, 7 -; GFX11-NEXT: v_writelane_b32 v23, s41, 8 -; GFX11-NEXT: v_writelane_b32 v23, s42, 9 -; GFX11-NEXT: v_writelane_b32 v23, s43, 10 -; GFX11-NEXT: v_writelane_b32 v23, s44, 11 -; GFX11-NEXT: v_writelane_b32 v23, s45, 12 -; GFX11-NEXT: v_writelane_b32 v23, s46, 13 -; GFX11-NEXT: v_writelane_b32 v23, s47, 14 -; GFX11-NEXT: v_writelane_b32 v23, s48, 15 -; GFX11-NEXT: v_writelane_b32 v23, s49, 16 -; GFX11-NEXT: v_writelane_b32 v23, s50, 17 -; GFX11-NEXT: v_writelane_b32 v23, s51, 18 -; GFX11-NEXT: v_writelane_b32 v23, s52, 19 -; GFX11-NEXT: v_writelane_b32 v23, s53, 20 -; GFX11-NEXT: v_writelane_b32 v23, s54, 21 -; GFX11-NEXT: v_writelane_b32 v23, s55, 22 -; GFX11-NEXT: v_writelane_b32 v23, s56, 23 -; GFX11-NEXT: v_writelane_b32 v23, s57, 24 -; GFX11-NEXT: v_writelane_b32 v23, s58, 25 -; GFX11-NEXT: v_writelane_b32 v23, s59, 26 -; GFX11-NEXT: v_writelane_b32 v23, s30, 27 -; GFX11-NEXT: v_writelane_b32 v23, s31, 28 +; GFX11-NEXT: v_writelane_b32 v23, s48, 7 +; GFX11-NEXT: v_writelane_b32 v23, s49, 8 +; GFX11-NEXT: v_writelane_b32 v23, s50, 9 +; GFX11-NEXT: v_writelane_b32 v23, s51, 10 +; GFX11-NEXT: v_writelane_b32 v23, s52, 11 +; GFX11-NEXT: v_writelane_b32 v23, s53, 12 +; GFX11-NEXT: v_writelane_b32 v23, s54, 13 +; GFX11-NEXT: v_writelane_b32 v23, s55, 14 +; GFX11-NEXT: v_writelane_b32 v23, s30, 15 +; GFX11-NEXT: v_writelane_b32 v23, s31, 16 ; GFX11-NEXT: s_add_i32 s0, s32, 64 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -617,39 +457,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; 
GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_addc_u32 s32, s32, 0x4040 -; GFX11-NEXT: s_bitcmp1_b32 s32, 0 -; GFX11-NEXT: s_bitset0_b32 s32, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s59, s32 -; GFX11-NEXT: s_addc_u32 s32, s32, 0xffffbfc0 -; GFX11-NEXT: s_bitcmp1_b32 s32, 0 -; GFX11-NEXT: s_bitset0_b32 s32, 0 +; GFX11-NEXT: s_addc_u32 s60, s32, 0x4040 +; GFX11-NEXT: s_bitcmp1_b32 s60, 0 +; GFX11-NEXT: s_bitset0_b32 s60, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s59, s60 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s30, v23, 27 -; GFX11-NEXT: v_readlane_b32 s31, v23, 28 -; GFX11-NEXT: v_readlane_b32 s59, v23, 26 -; GFX11-NEXT: v_readlane_b32 s58, v23, 25 -; GFX11-NEXT: v_readlane_b32 s57, v23, 24 -; GFX11-NEXT: v_readlane_b32 s56, v23, 23 -; GFX11-NEXT: v_readlane_b32 s55, v23, 22 -; GFX11-NEXT: v_readlane_b32 s54, v23, 21 -; GFX11-NEXT: v_readlane_b32 s53, v23, 20 -; GFX11-NEXT: v_readlane_b32 s52, v23, 19 -; GFX11-NEXT: v_readlane_b32 s51, v23, 18 -; GFX11-NEXT: v_readlane_b32 s50, v23, 17 -; GFX11-NEXT: v_readlane_b32 s49, v23, 16 -; GFX11-NEXT: v_readlane_b32 s48, v23, 15 -; GFX11-NEXT: v_readlane_b32 s47, v23, 14 -; GFX11-NEXT: v_readlane_b32 s46, v23, 13 -; GFX11-NEXT: v_readlane_b32 s45, v23, 12 -; GFX11-NEXT: v_readlane_b32 s44, v23, 11 -; GFX11-NEXT: v_readlane_b32 s43, v23, 10 -; GFX11-NEXT: v_readlane_b32 s42, v23, 9 -; GFX11-NEXT: v_readlane_b32 s41, v23, 8 -; GFX11-NEXT: v_readlane_b32 s40, v23, 7 +; GFX11-NEXT: v_readlane_b32 s30, v23, 15 +; GFX11-NEXT: v_readlane_b32 s31, v23, 16 +; GFX11-NEXT: v_readlane_b32 s55, v23, 14 +; GFX11-NEXT: v_readlane_b32 s54, v23, 13 +; GFX11-NEXT: 
v_readlane_b32 s53, v23, 12 +; GFX11-NEXT: v_readlane_b32 s52, v23, 11 +; GFX11-NEXT: v_readlane_b32 s51, v23, 10 +; GFX11-NEXT: v_readlane_b32 s50, v23, 9 +; GFX11-NEXT: v_readlane_b32 s49, v23, 8 +; GFX11-NEXT: v_readlane_b32 s48, v23, 7 ; GFX11-NEXT: v_readlane_b32 s39, v23, 6 ; GFX11-NEXT: v_readlane_b32 s38, v23, 5 ; GFX11-NEXT: v_readlane_b32 s37, v23, 4 @@ -682,28 +507,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX12-NEXT: v_writelane_b32 v23, s37, 4 ; GFX12-NEXT: v_writelane_b32 v23, s38, 5 ; GFX12-NEXT: v_writelane_b32 v23, s39, 6 -; GFX12-NEXT: v_writelane_b32 v23, s40, 7 -; GFX12-NEXT: v_writelane_b32 v23, s41, 8 -; GFX12-NEXT: v_writelane_b32 v23, s42, 9 -; GFX12-NEXT: v_writelane_b32 v23, s43, 10 -; GFX12-NEXT: v_writelane_b32 v23, s44, 11 -; GFX12-NEXT: v_writelane_b32 v23, s45, 12 -; GFX12-NEXT: v_writelane_b32 v23, s46, 13 -; GFX12-NEXT: v_writelane_b32 v23, s47, 14 -; GFX12-NEXT: v_writelane_b32 v23, s48, 15 -; GFX12-NEXT: v_writelane_b32 v23, s49, 16 -; GFX12-NEXT: v_writelane_b32 v23, s50, 17 -; GFX12-NEXT: v_writelane_b32 v23, s51, 18 -; GFX12-NEXT: v_writelane_b32 v23, s52, 19 -; GFX12-NEXT: v_writelane_b32 v23, s53, 20 -; GFX12-NEXT: v_writelane_b32 v23, s54, 21 -; GFX12-NEXT: v_writelane_b32 v23, s55, 22 -; GFX12-NEXT: v_writelane_b32 v23, s56, 23 -; GFX12-NEXT: v_writelane_b32 v23, s57, 24 -; GFX12-NEXT: v_writelane_b32 v23, s58, 25 -; GFX12-NEXT: v_writelane_b32 v23, s59, 26 -; GFX12-NEXT: v_writelane_b32 v23, s30, 27 -; GFX12-NEXT: v_writelane_b32 v23, s31, 28 +; GFX12-NEXT: v_writelane_b32 v23, s48, 7 +; GFX12-NEXT: v_writelane_b32 v23, s49, 8 +; GFX12-NEXT: v_writelane_b32 v23, s50, 9 +; GFX12-NEXT: v_writelane_b32 v23, s51, 10 +; GFX12-NEXT: v_writelane_b32 v23, s52, 11 +; GFX12-NEXT: v_writelane_b32 v23, s53, 12 +; GFX12-NEXT: v_writelane_b32 v23, s54, 13 +; GFX12-NEXT: v_writelane_b32 v23, s55, 14 +; GFX12-NEXT: v_writelane_b32 v23, s30, 15 +; GFX12-NEXT: v_writelane_b32 v23, s31, 16 ; 
GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART @@ -712,41 +525,25 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0x4000 +; GFX12-NEXT: s_add_co_ci_u32 s60, s32, 0x4000 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_bitcmp1_b32 s32, 0 -; GFX12-NEXT: s_bitset0_b32 s32, 0 +; GFX12-NEXT: s_bitcmp1_b32 s60, 0 +; GFX12-NEXT: s_bitset0_b32 s60, 0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_mov_b32 s59, s32 -; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0xffffc000 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_bitcmp1_b32 s32, 0 -; GFX12-NEXT: s_bitset0_b32 s32, 0 +; GFX12-NEXT: s_mov_b32 s59, s60 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s30, v23, 27 -; GFX12-NEXT: v_readlane_b32 s31, v23, 28 -; GFX12-NEXT: v_readlane_b32 s59, v23, 26 -; GFX12-NEXT: v_readlane_b32 s58, v23, 25 -; GFX12-NEXT: v_readlane_b32 s57, v23, 24 -; GFX12-NEXT: v_readlane_b32 s56, v23, 23 -; GFX12-NEXT: v_readlane_b32 s55, v23, 22 -; GFX12-NEXT: v_readlane_b32 s54, v23, 21 -; GFX12-NEXT: v_readlane_b32 s53, v23, 20 -; GFX12-NEXT: v_readlane_b32 s52, v23, 19 -; GFX12-NEXT: v_readlane_b32 s51, v23, 18 -; GFX12-NEXT: v_readlane_b32 s50, v23, 17 -; GFX12-NEXT: v_readlane_b32 s49, v23, 16 -; GFX12-NEXT: v_readlane_b32 s48, v23, 15 -; GFX12-NEXT: v_readlane_b32 s47, v23, 14 -; GFX12-NEXT: v_readlane_b32 s46, v23, 13 -; GFX12-NEXT: v_readlane_b32 s45, v23, 12 -; GFX12-NEXT: v_readlane_b32 s44, v23, 11 -; GFX12-NEXT: v_readlane_b32 s43, v23, 10 -; GFX12-NEXT: v_readlane_b32 s42, v23, 9 -; GFX12-NEXT: v_readlane_b32 s41, v23, 8 -; GFX12-NEXT: v_readlane_b32 s40, v23, 7 +; GFX12-NEXT: 
v_readlane_b32 s30, v23, 15 +; GFX12-NEXT: v_readlane_b32 s31, v23, 16 +; GFX12-NEXT: v_readlane_b32 s55, v23, 14 +; GFX12-NEXT: v_readlane_b32 s54, v23, 13 +; GFX12-NEXT: v_readlane_b32 s53, v23, 12 +; GFX12-NEXT: v_readlane_b32 s52, v23, 11 +; GFX12-NEXT: v_readlane_b32 s51, v23, 10 +; GFX12-NEXT: v_readlane_b32 s50, v23, 9 +; GFX12-NEXT: v_readlane_b32 s49, v23, 8 +; GFX12-NEXT: v_readlane_b32 s48, v23, 7 ; GFX12-NEXT: v_readlane_b32 s39, v23, 6 ; GFX12-NEXT: v_readlane_b32 s38, v23, 5 ; GFX12-NEXT: v_readlane_b32 s37, v23, 4 @@ -759,7 +556,6 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) @@ -818,28 +614,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX7-NEXT: v_writelane_b32 v21, s37, 4 ; GFX7-NEXT: v_writelane_b32 v21, s38, 5 ; GFX7-NEXT: v_writelane_b32 v21, s39, 6 -; GFX7-NEXT: v_writelane_b32 v21, s40, 7 -; GFX7-NEXT: v_writelane_b32 v21, s41, 8 -; GFX7-NEXT: v_writelane_b32 v21, s42, 9 -; GFX7-NEXT: v_writelane_b32 v21, s43, 10 -; GFX7-NEXT: v_writelane_b32 v21, s44, 11 -; GFX7-NEXT: v_writelane_b32 v21, s45, 12 -; GFX7-NEXT: v_writelane_b32 v21, s46, 13 -; GFX7-NEXT: v_writelane_b32 v21, s47, 14 -; GFX7-NEXT: v_writelane_b32 v21, s48, 15 -; GFX7-NEXT: v_writelane_b32 v21, s49, 16 -; GFX7-NEXT: v_writelane_b32 v21, s50, 17 -; GFX7-NEXT: v_writelane_b32 v21, s51, 18 -; GFX7-NEXT: v_writelane_b32 v21, s52, 19 -; GFX7-NEXT: v_writelane_b32 v21, s53, 20 -; GFX7-NEXT: v_writelane_b32 v21, s54, 21 -; GFX7-NEXT: v_writelane_b32 v21, s55, 22 -; GFX7-NEXT: v_writelane_b32 v21, s56, 23 -; GFX7-NEXT: v_writelane_b32 v21, s57, 24 -; GFX7-NEXT: v_writelane_b32 v21, s58, 25 -; GFX7-NEXT: v_writelane_b32 v21, s59, 26 -; 
GFX7-NEXT: v_writelane_b32 v21, s30, 27 -; GFX7-NEXT: v_writelane_b32 v21, s31, 28 +; GFX7-NEXT: v_writelane_b32 v21, s48, 7 +; GFX7-NEXT: v_writelane_b32 v21, s49, 8 +; GFX7-NEXT: v_writelane_b32 v21, s50, 9 +; GFX7-NEXT: v_writelane_b32 v21, s51, 10 +; GFX7-NEXT: v_writelane_b32 v21, s52, 11 +; GFX7-NEXT: v_writelane_b32 v21, s53, 12 +; GFX7-NEXT: v_writelane_b32 v21, s54, 13 +; GFX7-NEXT: v_writelane_b32 v21, s55, 14 +; GFX7-NEXT: v_writelane_b32 v21, s30, 15 +; GFX7-NEXT: v_writelane_b32 v21, s31, 16 ; GFX7-NEXT: s_and_b64 s[4:5], 0, exec ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc @@ -850,28 +634,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX7-NEXT: ;;#ASMEND -; GFX7-NEXT: v_readlane_b32 s30, v21, 27 -; GFX7-NEXT: v_readlane_b32 s31, v21, 28 -; GFX7-NEXT: v_readlane_b32 s59, v21, 26 -; GFX7-NEXT: v_readlane_b32 s58, v21, 25 -; GFX7-NEXT: v_readlane_b32 s57, v21, 24 -; GFX7-NEXT: v_readlane_b32 s56, v21, 23 -; GFX7-NEXT: v_readlane_b32 s55, v21, 22 -; GFX7-NEXT: v_readlane_b32 s54, v21, 21 -; GFX7-NEXT: v_readlane_b32 s53, v21, 20 -; GFX7-NEXT: v_readlane_b32 s52, v21, 19 -; GFX7-NEXT: v_readlane_b32 s51, v21, 18 -; GFX7-NEXT: v_readlane_b32 s50, v21, 17 -; GFX7-NEXT: v_readlane_b32 s49, v21, 16 -; GFX7-NEXT: v_readlane_b32 s48, v21, 15 -; GFX7-NEXT: v_readlane_b32 s47, v21, 14 -; GFX7-NEXT: v_readlane_b32 s46, v21, 13 -; GFX7-NEXT: v_readlane_b32 s45, v21, 12 -; GFX7-NEXT: v_readlane_b32 s44, v21, 11 -; GFX7-NEXT: v_readlane_b32 s43, v21, 10 -; GFX7-NEXT: v_readlane_b32 s42, v21, 9 -; GFX7-NEXT: v_readlane_b32 s41, v21, 8 -; GFX7-NEXT: v_readlane_b32 s40, v21, 7 +; GFX7-NEXT: v_readlane_b32 s30, v21, 15 +; GFX7-NEXT: v_readlane_b32 s31, v21, 16 +; GFX7-NEXT: v_readlane_b32 s55, v21, 14 +; GFX7-NEXT: 
v_readlane_b32 s54, v21, 13 +; GFX7-NEXT: v_readlane_b32 s53, v21, 12 +; GFX7-NEXT: v_readlane_b32 s52, v21, 11 +; GFX7-NEXT: v_readlane_b32 s51, v21, 10 +; GFX7-NEXT: v_readlane_b32 s50, v21, 9 +; GFX7-NEXT: v_readlane_b32 s49, v21, 8 +; GFX7-NEXT: v_readlane_b32 s48, v21, 7 ; GFX7-NEXT: v_readlane_b32 s39, v21, 6 ; GFX7-NEXT: v_readlane_b32 s38, v21, 5 ; GFX7-NEXT: v_readlane_b32 s37, v21, 4 @@ -900,28 +672,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX8-NEXT: v_writelane_b32 v21, s37, 4 ; GFX8-NEXT: v_writelane_b32 v21, s38, 5 ; GFX8-NEXT: v_writelane_b32 v21, s39, 6 -; GFX8-NEXT: v_writelane_b32 v21, s40, 7 -; GFX8-NEXT: v_writelane_b32 v21, s41, 8 -; GFX8-NEXT: v_writelane_b32 v21, s42, 9 -; GFX8-NEXT: v_writelane_b32 v21, s43, 10 -; GFX8-NEXT: v_writelane_b32 v21, s44, 11 -; GFX8-NEXT: v_writelane_b32 v21, s45, 12 -; GFX8-NEXT: v_writelane_b32 v21, s46, 13 -; GFX8-NEXT: v_writelane_b32 v21, s47, 14 -; GFX8-NEXT: v_writelane_b32 v21, s48, 15 -; GFX8-NEXT: v_writelane_b32 v21, s49, 16 -; GFX8-NEXT: v_writelane_b32 v21, s50, 17 -; GFX8-NEXT: v_writelane_b32 v21, s51, 18 -; GFX8-NEXT: v_writelane_b32 v21, s52, 19 -; GFX8-NEXT: v_writelane_b32 v21, s53, 20 -; GFX8-NEXT: v_writelane_b32 v21, s54, 21 -; GFX8-NEXT: v_writelane_b32 v21, s55, 22 -; GFX8-NEXT: v_writelane_b32 v21, s56, 23 -; GFX8-NEXT: v_writelane_b32 v21, s57, 24 -; GFX8-NEXT: v_writelane_b32 v21, s58, 25 -; GFX8-NEXT: v_writelane_b32 v21, s59, 26 -; GFX8-NEXT: v_writelane_b32 v21, s30, 27 -; GFX8-NEXT: v_writelane_b32 v21, s31, 28 +; GFX8-NEXT: v_writelane_b32 v21, s48, 7 +; GFX8-NEXT: v_writelane_b32 v21, s49, 8 +; GFX8-NEXT: v_writelane_b32 v21, s50, 9 +; GFX8-NEXT: v_writelane_b32 v21, s51, 10 +; GFX8-NEXT: v_writelane_b32 v21, s52, 11 +; GFX8-NEXT: v_writelane_b32 v21, s53, 12 +; GFX8-NEXT: v_writelane_b32 v21, s54, 13 +; GFX8-NEXT: v_writelane_b32 v21, s55, 14 +; GFX8-NEXT: v_writelane_b32 v21, s30, 15 +; GFX8-NEXT: v_writelane_b32 v21, s31, 16 ; 
GFX8-NEXT: s_and_b64 s[4:5], 0, exec ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc @@ -932,28 +692,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s30, v21, 27 -; GFX8-NEXT: v_readlane_b32 s31, v21, 28 -; GFX8-NEXT: v_readlane_b32 s59, v21, 26 -; GFX8-NEXT: v_readlane_b32 s58, v21, 25 -; GFX8-NEXT: v_readlane_b32 s57, v21, 24 -; GFX8-NEXT: v_readlane_b32 s56, v21, 23 -; GFX8-NEXT: v_readlane_b32 s55, v21, 22 -; GFX8-NEXT: v_readlane_b32 s54, v21, 21 -; GFX8-NEXT: v_readlane_b32 s53, v21, 20 -; GFX8-NEXT: v_readlane_b32 s52, v21, 19 -; GFX8-NEXT: v_readlane_b32 s51, v21, 18 -; GFX8-NEXT: v_readlane_b32 s50, v21, 17 -; GFX8-NEXT: v_readlane_b32 s49, v21, 16 -; GFX8-NEXT: v_readlane_b32 s48, v21, 15 -; GFX8-NEXT: v_readlane_b32 s47, v21, 14 -; GFX8-NEXT: v_readlane_b32 s46, v21, 13 -; GFX8-NEXT: v_readlane_b32 s45, v21, 12 -; GFX8-NEXT: v_readlane_b32 s44, v21, 11 -; GFX8-NEXT: v_readlane_b32 s43, v21, 10 -; GFX8-NEXT: v_readlane_b32 s42, v21, 9 -; GFX8-NEXT: v_readlane_b32 s41, v21, 8 -; GFX8-NEXT: v_readlane_b32 s40, v21, 7 +; GFX8-NEXT: v_readlane_b32 s30, v21, 15 +; GFX8-NEXT: v_readlane_b32 s31, v21, 16 +; GFX8-NEXT: v_readlane_b32 s55, v21, 14 +; GFX8-NEXT: v_readlane_b32 s54, v21, 13 +; GFX8-NEXT: v_readlane_b32 s53, v21, 12 +; GFX8-NEXT: v_readlane_b32 s52, v21, 11 +; GFX8-NEXT: v_readlane_b32 s51, v21, 10 +; GFX8-NEXT: v_readlane_b32 s50, v21, 9 +; GFX8-NEXT: v_readlane_b32 s49, v21, 8 +; GFX8-NEXT: v_readlane_b32 s48, v21, 7 ; GFX8-NEXT: v_readlane_b32 s39, v21, 6 ; GFX8-NEXT: v_readlane_b32 s38, v21, 5 ; GFX8-NEXT: v_readlane_b32 s37, v21, 4 @@ -982,28 +730,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX900-NEXT: 
v_writelane_b32 v21, s37, 4 ; GFX900-NEXT: v_writelane_b32 v21, s38, 5 ; GFX900-NEXT: v_writelane_b32 v21, s39, 6 -; GFX900-NEXT: v_writelane_b32 v21, s40, 7 -; GFX900-NEXT: v_writelane_b32 v21, s41, 8 -; GFX900-NEXT: v_writelane_b32 v21, s42, 9 -; GFX900-NEXT: v_writelane_b32 v21, s43, 10 -; GFX900-NEXT: v_writelane_b32 v21, s44, 11 -; GFX900-NEXT: v_writelane_b32 v21, s45, 12 -; GFX900-NEXT: v_writelane_b32 v21, s46, 13 -; GFX900-NEXT: v_writelane_b32 v21, s47, 14 -; GFX900-NEXT: v_writelane_b32 v21, s48, 15 -; GFX900-NEXT: v_writelane_b32 v21, s49, 16 -; GFX900-NEXT: v_writelane_b32 v21, s50, 17 -; GFX900-NEXT: v_writelane_b32 v21, s51, 18 -; GFX900-NEXT: v_writelane_b32 v21, s52, 19 -; GFX900-NEXT: v_writelane_b32 v21, s53, 20 -; GFX900-NEXT: v_writelane_b32 v21, s54, 21 -; GFX900-NEXT: v_writelane_b32 v21, s55, 22 -; GFX900-NEXT: v_writelane_b32 v21, s56, 23 -; GFX900-NEXT: v_writelane_b32 v21, s57, 24 -; GFX900-NEXT: v_writelane_b32 v21, s58, 25 -; GFX900-NEXT: v_writelane_b32 v21, s59, 26 -; GFX900-NEXT: v_writelane_b32 v21, s30, 27 -; GFX900-NEXT: v_writelane_b32 v21, s31, 28 +; GFX900-NEXT: v_writelane_b32 v21, s48, 7 +; GFX900-NEXT: v_writelane_b32 v21, s49, 8 +; GFX900-NEXT: v_writelane_b32 v21, s50, 9 +; GFX900-NEXT: v_writelane_b32 v21, s51, 10 +; GFX900-NEXT: v_writelane_b32 v21, s52, 11 +; GFX900-NEXT: v_writelane_b32 v21, s53, 12 +; GFX900-NEXT: v_writelane_b32 v21, s54, 13 +; GFX900-NEXT: v_writelane_b32 v21, s55, 14 +; GFX900-NEXT: v_writelane_b32 v21, s30, 15 +; GFX900-NEXT: v_writelane_b32 v21, s31, 16 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc @@ -1014,28 +750,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: 
v_readlane_b32 s30, v21, 27 -; GFX900-NEXT: v_readlane_b32 s31, v21, 28 -; GFX900-NEXT: v_readlane_b32 s59, v21, 26 -; GFX900-NEXT: v_readlane_b32 s58, v21, 25 -; GFX900-NEXT: v_readlane_b32 s57, v21, 24 -; GFX900-NEXT: v_readlane_b32 s56, v21, 23 -; GFX900-NEXT: v_readlane_b32 s55, v21, 22 -; GFX900-NEXT: v_readlane_b32 s54, v21, 21 -; GFX900-NEXT: v_readlane_b32 s53, v21, 20 -; GFX900-NEXT: v_readlane_b32 s52, v21, 19 -; GFX900-NEXT: v_readlane_b32 s51, v21, 18 -; GFX900-NEXT: v_readlane_b32 s50, v21, 17 -; GFX900-NEXT: v_readlane_b32 s49, v21, 16 -; GFX900-NEXT: v_readlane_b32 s48, v21, 15 -; GFX900-NEXT: v_readlane_b32 s47, v21, 14 -; GFX900-NEXT: v_readlane_b32 s46, v21, 13 -; GFX900-NEXT: v_readlane_b32 s45, v21, 12 -; GFX900-NEXT: v_readlane_b32 s44, v21, 11 -; GFX900-NEXT: v_readlane_b32 s43, v21, 10 -; GFX900-NEXT: v_readlane_b32 s42, v21, 9 -; GFX900-NEXT: v_readlane_b32 s41, v21, 8 -; GFX900-NEXT: v_readlane_b32 s40, v21, 7 +; GFX900-NEXT: v_readlane_b32 s30, v21, 15 +; GFX900-NEXT: v_readlane_b32 s31, v21, 16 +; GFX900-NEXT: v_readlane_b32 s55, v21, 14 +; GFX900-NEXT: v_readlane_b32 s54, v21, 13 +; GFX900-NEXT: v_readlane_b32 s53, v21, 12 +; GFX900-NEXT: v_readlane_b32 s52, v21, 11 +; GFX900-NEXT: v_readlane_b32 s51, v21, 10 +; GFX900-NEXT: v_readlane_b32 s50, v21, 9 +; GFX900-NEXT: v_readlane_b32 s49, v21, 8 +; GFX900-NEXT: v_readlane_b32 s48, v21, 7 ; GFX900-NEXT: v_readlane_b32 s39, v21, 6 ; GFX900-NEXT: v_readlane_b32 s38, v21, 5 ; GFX900-NEXT: v_readlane_b32 s37, v21, 4 @@ -1050,93 +774,65 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4010 -; GFX940-NEXT: scratch_store_dword off, v21, s2 sc0 
sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v21, s33, 0 -; GFX940-NEXT: v_writelane_b32 v21, s34, 1 -; GFX940-NEXT: v_writelane_b32 v21, s35, 2 -; GFX940-NEXT: v_writelane_b32 v21, s36, 3 -; GFX940-NEXT: v_writelane_b32 v21, s37, 4 -; GFX940-NEXT: v_writelane_b32 v21, s38, 5 -; GFX940-NEXT: v_writelane_b32 v21, s39, 6 -; GFX940-NEXT: v_writelane_b32 v21, s40, 7 -; GFX940-NEXT: v_writelane_b32 v21, s41, 8 -; GFX940-NEXT: v_writelane_b32 v21, s42, 9 -; GFX940-NEXT: v_writelane_b32 v21, s43, 10 -; GFX940-NEXT: v_writelane_b32 v21, s44, 11 -; GFX940-NEXT: v_writelane_b32 v21, s45, 12 -; GFX940-NEXT: v_writelane_b32 v21, s46, 13 -; GFX940-NEXT: v_writelane_b32 v21, s47, 14 -; GFX940-NEXT: v_writelane_b32 v21, s48, 15 -; GFX940-NEXT: v_writelane_b32 v21, s49, 16 -; GFX940-NEXT: v_writelane_b32 v21, s50, 17 -; GFX940-NEXT: v_writelane_b32 v21, s51, 18 -; GFX940-NEXT: v_writelane_b32 v21, s52, 19 -; GFX940-NEXT: v_writelane_b32 v21, s53, 20 -; GFX940-NEXT: v_writelane_b32 v21, s54, 21 -; GFX940-NEXT: v_writelane_b32 v21, s55, 22 -; GFX940-NEXT: v_writelane_b32 v21, s56, 23 -; GFX940-NEXT: v_writelane_b32 v21, s57, 24 -; GFX940-NEXT: v_writelane_b32 v21, s58, 25 -; GFX940-NEXT: v_writelane_b32 v21, s59, 26 -; GFX940-NEXT: v_writelane_b32 v21, s60, 27 -; GFX940-NEXT: v_writelane_b32 v21, s61, 28 -; GFX940-NEXT: v_writelane_b32 v21, s30, 29 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v21, s31, 30 -; GFX940-NEXT: s_and_b64 s[60:61], 0, exec -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_addc_u32 s60, s32, 16 -; GFX940-NEXT: s_bitcmp1_b32 s60, 0 -; GFX940-NEXT: s_bitset0_b32 s60, 0 -; GFX940-NEXT: s_mov_b32 s59, s60 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: 
v_readlane_b32 s30, v21, 29 -; GFX940-NEXT: v_readlane_b32 s31, v21, 30 -; GFX940-NEXT: v_readlane_b32 s61, v21, 28 -; GFX940-NEXT: v_readlane_b32 s60, v21, 27 -; GFX940-NEXT: v_readlane_b32 s59, v21, 26 -; GFX940-NEXT: v_readlane_b32 s58, v21, 25 -; GFX940-NEXT: v_readlane_b32 s57, v21, 24 -; GFX940-NEXT: v_readlane_b32 s56, v21, 23 -; GFX940-NEXT: v_readlane_b32 s55, v21, 22 -; GFX940-NEXT: v_readlane_b32 s54, v21, 21 -; GFX940-NEXT: v_readlane_b32 s53, v21, 20 -; GFX940-NEXT: v_readlane_b32 s52, v21, 19 -; GFX940-NEXT: v_readlane_b32 s51, v21, 18 -; GFX940-NEXT: v_readlane_b32 s50, v21, 17 -; GFX940-NEXT: v_readlane_b32 s49, v21, 16 -; GFX940-NEXT: v_readlane_b32 s48, v21, 15 -; GFX940-NEXT: v_readlane_b32 s47, v21, 14 -; GFX940-NEXT: v_readlane_b32 s46, v21, 13 -; GFX940-NEXT: v_readlane_b32 s45, v21, 12 -; GFX940-NEXT: v_readlane_b32 s44, v21, 11 -; GFX940-NEXT: v_readlane_b32 s43, v21, 10 -; GFX940-NEXT: v_readlane_b32 s42, v21, 9 -; GFX940-NEXT: v_readlane_b32 s41, v21, 8 -; GFX940-NEXT: v_readlane_b32 s40, v21, 7 -; GFX940-NEXT: v_readlane_b32 s39, v21, 6 -; GFX940-NEXT: v_readlane_b32 s38, v21, 5 -; GFX940-NEXT: v_readlane_b32 s37, v21, 4 -; GFX940-NEXT: v_readlane_b32 s36, v21, 3 -; GFX940-NEXT: v_readlane_b32 s35, v21, 2 -; GFX940-NEXT: v_readlane_b32 s34, v21, 1 -; GFX940-NEXT: v_readlane_b32 s33, v21, 0 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x4010 -; GFX940-NEXT: scratch_load_dword v21, off, s2 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: s_add_i32 s2, s32, 0x4010 +; GFX942-NEXT: scratch_store_dword off, v21, s2 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: 
v_writelane_b32 v21, s33, 0 +; GFX942-NEXT: v_writelane_b32 v21, s34, 1 +; GFX942-NEXT: v_writelane_b32 v21, s35, 2 +; GFX942-NEXT: v_writelane_b32 v21, s36, 3 +; GFX942-NEXT: v_writelane_b32 v21, s37, 4 +; GFX942-NEXT: v_writelane_b32 v21, s38, 5 +; GFX942-NEXT: v_writelane_b32 v21, s39, 6 +; GFX942-NEXT: v_writelane_b32 v21, s48, 7 +; GFX942-NEXT: v_writelane_b32 v21, s49, 8 +; GFX942-NEXT: v_writelane_b32 v21, s50, 9 +; GFX942-NEXT: v_writelane_b32 v21, s51, 10 +; GFX942-NEXT: v_writelane_b32 v21, s52, 11 +; GFX942-NEXT: v_writelane_b32 v21, s53, 12 +; GFX942-NEXT: v_writelane_b32 v21, s54, 13 +; GFX942-NEXT: v_writelane_b32 v21, s55, 14 +; GFX942-NEXT: v_writelane_b32 v21, s30, 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v21, s31, 16 +; GFX942-NEXT: s_and_b64 s[60:61], 0, exec +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_addc_u32 s60, s32, 16 +; GFX942-NEXT: s_bitcmp1_b32 s60, 0 +; GFX942-NEXT: s_bitset0_b32 s60, 0 +; GFX942-NEXT: s_mov_b32 s59, s60 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s30, v21, 15 +; GFX942-NEXT: v_readlane_b32 s31, v21, 16 +; GFX942-NEXT: v_readlane_b32 s55, v21, 14 +; GFX942-NEXT: v_readlane_b32 s54, v21, 13 +; GFX942-NEXT: v_readlane_b32 s53, v21, 12 +; GFX942-NEXT: v_readlane_b32 s52, v21, 11 +; GFX942-NEXT: v_readlane_b32 s51, v21, 10 +; GFX942-NEXT: v_readlane_b32 s50, v21, 9 +; GFX942-NEXT: v_readlane_b32 s49, v21, 8 +; GFX942-NEXT: v_readlane_b32 s48, v21, 7 +; GFX942-NEXT: v_readlane_b32 s39, v21, 6 +; GFX942-NEXT: v_readlane_b32 s38, v21, 5 +; GFX942-NEXT: v_readlane_b32 s37, v21, 4 +; GFX942-NEXT: v_readlane_b32 s36, v21, 3 +; GFX942-NEXT: v_readlane_b32 s35, v21, 2 +; GFX942-NEXT: v_readlane_b32 s34, v21, 1 +; GFX942-NEXT: v_readlane_b32 
s33, v21, 0 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: s_add_i32 s2, s32, 0x4010 +; GFX942-NEXT: scratch_load_dword v21, off, s2 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset: ; GFX10_1: ; %bb.0: @@ -1153,28 +849,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX10_1-NEXT: v_writelane_b32 v21, s37, 4 ; GFX10_1-NEXT: v_writelane_b32 v21, s38, 5 ; GFX10_1-NEXT: v_writelane_b32 v21, s39, 6 -; GFX10_1-NEXT: v_writelane_b32 v21, s40, 7 -; GFX10_1-NEXT: v_writelane_b32 v21, s41, 8 -; GFX10_1-NEXT: v_writelane_b32 v21, s42, 9 -; GFX10_1-NEXT: v_writelane_b32 v21, s43, 10 -; GFX10_1-NEXT: v_writelane_b32 v21, s44, 11 -; GFX10_1-NEXT: v_writelane_b32 v21, s45, 12 -; GFX10_1-NEXT: v_writelane_b32 v21, s46, 13 -; GFX10_1-NEXT: v_writelane_b32 v21, s47, 14 -; GFX10_1-NEXT: v_writelane_b32 v21, s48, 15 -; GFX10_1-NEXT: v_writelane_b32 v21, s49, 16 -; GFX10_1-NEXT: v_writelane_b32 v21, s50, 17 -; GFX10_1-NEXT: v_writelane_b32 v21, s51, 18 -; GFX10_1-NEXT: v_writelane_b32 v21, s52, 19 -; GFX10_1-NEXT: v_writelane_b32 v21, s53, 20 -; GFX10_1-NEXT: v_writelane_b32 v21, s54, 21 -; GFX10_1-NEXT: v_writelane_b32 v21, s55, 22 -; GFX10_1-NEXT: v_writelane_b32 v21, s56, 23 -; GFX10_1-NEXT: v_writelane_b32 v21, s57, 24 -; GFX10_1-NEXT: v_writelane_b32 v21, s58, 25 -; GFX10_1-NEXT: v_writelane_b32 v21, s59, 26 -; GFX10_1-NEXT: v_writelane_b32 v21, s30, 27 -; GFX10_1-NEXT: v_writelane_b32 v21, s31, 28 +; GFX10_1-NEXT: v_writelane_b32 v21, s48, 7 +; GFX10_1-NEXT: v_writelane_b32 v21, s49, 8 +; GFX10_1-NEXT: v_writelane_b32 v21, s50, 9 +; GFX10_1-NEXT: v_writelane_b32 v21, s51, 10 +; GFX10_1-NEXT: v_writelane_b32 v21, s52, 11 +; GFX10_1-NEXT: v_writelane_b32 v21, s53, 12 +; GFX10_1-NEXT: v_writelane_b32 v21, s54, 13 +; GFX10_1-NEXT: v_writelane_b32 
v21, s55, 14 +; GFX10_1-NEXT: v_writelane_b32 v21, s30, 15 +; GFX10_1-NEXT: v_writelane_b32 v21, s31, 16 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX10_1-NEXT: ;;#ASMEND @@ -1185,28 +869,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s30, v21, 27 -; GFX10_1-NEXT: v_readlane_b32 s31, v21, 28 -; GFX10_1-NEXT: v_readlane_b32 s59, v21, 26 -; GFX10_1-NEXT: v_readlane_b32 s58, v21, 25 -; GFX10_1-NEXT: v_readlane_b32 s57, v21, 24 -; GFX10_1-NEXT: v_readlane_b32 s56, v21, 23 -; GFX10_1-NEXT: v_readlane_b32 s55, v21, 22 -; GFX10_1-NEXT: v_readlane_b32 s54, v21, 21 -; GFX10_1-NEXT: v_readlane_b32 s53, v21, 20 -; GFX10_1-NEXT: v_readlane_b32 s52, v21, 19 -; GFX10_1-NEXT: v_readlane_b32 s51, v21, 18 -; GFX10_1-NEXT: v_readlane_b32 s50, v21, 17 -; GFX10_1-NEXT: v_readlane_b32 s49, v21, 16 -; GFX10_1-NEXT: v_readlane_b32 s48, v21, 15 -; GFX10_1-NEXT: v_readlane_b32 s47, v21, 14 -; GFX10_1-NEXT: v_readlane_b32 s46, v21, 13 -; GFX10_1-NEXT: v_readlane_b32 s45, v21, 12 -; GFX10_1-NEXT: v_readlane_b32 s44, v21, 11 -; GFX10_1-NEXT: v_readlane_b32 s43, v21, 10 -; GFX10_1-NEXT: v_readlane_b32 s42, v21, 9 -; GFX10_1-NEXT: v_readlane_b32 s41, v21, 8 -; GFX10_1-NEXT: v_readlane_b32 s40, v21, 7 +; GFX10_1-NEXT: v_readlane_b32 s30, v21, 15 +; GFX10_1-NEXT: v_readlane_b32 s31, v21, 16 +; GFX10_1-NEXT: v_readlane_b32 s55, v21, 14 +; GFX10_1-NEXT: v_readlane_b32 s54, v21, 13 +; GFX10_1-NEXT: v_readlane_b32 s53, v21, 12 +; GFX10_1-NEXT: v_readlane_b32 s52, v21, 11 +; GFX10_1-NEXT: v_readlane_b32 s51, v21, 10 +; GFX10_1-NEXT: v_readlane_b32 s50, v21, 9 +; GFX10_1-NEXT: v_readlane_b32 s49, v21, 8 +; GFX10_1-NEXT: v_readlane_b32 s48, v21, 7 ; GFX10_1-NEXT: v_readlane_b32 s39, v21, 
6 ; GFX10_1-NEXT: v_readlane_b32 s38, v21, 5 ; GFX10_1-NEXT: v_readlane_b32 s37, v21, 4 @@ -1236,28 +908,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX10_3-NEXT: v_writelane_b32 v21, s37, 4 ; GFX10_3-NEXT: v_writelane_b32 v21, s38, 5 ; GFX10_3-NEXT: v_writelane_b32 v21, s39, 6 -; GFX10_3-NEXT: v_writelane_b32 v21, s40, 7 -; GFX10_3-NEXT: v_writelane_b32 v21, s41, 8 -; GFX10_3-NEXT: v_writelane_b32 v21, s42, 9 -; GFX10_3-NEXT: v_writelane_b32 v21, s43, 10 -; GFX10_3-NEXT: v_writelane_b32 v21, s44, 11 -; GFX10_3-NEXT: v_writelane_b32 v21, s45, 12 -; GFX10_3-NEXT: v_writelane_b32 v21, s46, 13 -; GFX10_3-NEXT: v_writelane_b32 v21, s47, 14 -; GFX10_3-NEXT: v_writelane_b32 v21, s48, 15 -; GFX10_3-NEXT: v_writelane_b32 v21, s49, 16 -; GFX10_3-NEXT: v_writelane_b32 v21, s50, 17 -; GFX10_3-NEXT: v_writelane_b32 v21, s51, 18 -; GFX10_3-NEXT: v_writelane_b32 v21, s52, 19 -; GFX10_3-NEXT: v_writelane_b32 v21, s53, 20 -; GFX10_3-NEXT: v_writelane_b32 v21, s54, 21 -; GFX10_3-NEXT: v_writelane_b32 v21, s55, 22 -; GFX10_3-NEXT: v_writelane_b32 v21, s56, 23 -; GFX10_3-NEXT: v_writelane_b32 v21, s57, 24 -; GFX10_3-NEXT: v_writelane_b32 v21, s58, 25 -; GFX10_3-NEXT: v_writelane_b32 v21, s59, 26 -; GFX10_3-NEXT: v_writelane_b32 v21, s30, 27 -; GFX10_3-NEXT: v_writelane_b32 v21, s31, 28 +; GFX10_3-NEXT: v_writelane_b32 v21, s48, 7 +; GFX10_3-NEXT: v_writelane_b32 v21, s49, 8 +; GFX10_3-NEXT: v_writelane_b32 v21, s50, 9 +; GFX10_3-NEXT: v_writelane_b32 v21, s51, 10 +; GFX10_3-NEXT: v_writelane_b32 v21, s52, 11 +; GFX10_3-NEXT: v_writelane_b32 v21, s53, 12 +; GFX10_3-NEXT: v_writelane_b32 v21, s54, 13 +; GFX10_3-NEXT: v_writelane_b32 v21, s55, 14 +; GFX10_3-NEXT: v_writelane_b32 v21, s30, 15 +; GFX10_3-NEXT: v_writelane_b32 v21, s31, 16 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX10_3-NEXT: ;;#ASMEND @@ -1268,28 +928,16 @@ define void 
@scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s30, v21, 27 -; GFX10_3-NEXT: v_readlane_b32 s31, v21, 28 -; GFX10_3-NEXT: v_readlane_b32 s59, v21, 26 -; GFX10_3-NEXT: v_readlane_b32 s58, v21, 25 -; GFX10_3-NEXT: v_readlane_b32 s57, v21, 24 -; GFX10_3-NEXT: v_readlane_b32 s56, v21, 23 -; GFX10_3-NEXT: v_readlane_b32 s55, v21, 22 -; GFX10_3-NEXT: v_readlane_b32 s54, v21, 21 -; GFX10_3-NEXT: v_readlane_b32 s53, v21, 20 -; GFX10_3-NEXT: v_readlane_b32 s52, v21, 19 -; GFX10_3-NEXT: v_readlane_b32 s51, v21, 18 -; GFX10_3-NEXT: v_readlane_b32 s50, v21, 17 -; GFX10_3-NEXT: v_readlane_b32 s49, v21, 16 -; GFX10_3-NEXT: v_readlane_b32 s48, v21, 15 -; GFX10_3-NEXT: v_readlane_b32 s47, v21, 14 -; GFX10_3-NEXT: v_readlane_b32 s46, v21, 13 -; GFX10_3-NEXT: v_readlane_b32 s45, v21, 12 -; GFX10_3-NEXT: v_readlane_b32 s44, v21, 11 -; GFX10_3-NEXT: v_readlane_b32 s43, v21, 10 -; GFX10_3-NEXT: v_readlane_b32 s42, v21, 9 -; GFX10_3-NEXT: v_readlane_b32 s41, v21, 8 -; GFX10_3-NEXT: v_readlane_b32 s40, v21, 7 +; GFX10_3-NEXT: v_readlane_b32 s30, v21, 15 +; GFX10_3-NEXT: v_readlane_b32 s31, v21, 16 +; GFX10_3-NEXT: v_readlane_b32 s55, v21, 14 +; GFX10_3-NEXT: v_readlane_b32 s54, v21, 13 +; GFX10_3-NEXT: v_readlane_b32 s53, v21, 12 +; GFX10_3-NEXT: v_readlane_b32 s52, v21, 11 +; GFX10_3-NEXT: v_readlane_b32 s51, v21, 10 +; GFX10_3-NEXT: v_readlane_b32 s50, v21, 9 +; GFX10_3-NEXT: v_readlane_b32 s49, v21, 8 +; GFX10_3-NEXT: v_readlane_b32 s48, v21, 7 ; GFX10_3-NEXT: v_readlane_b32 s39, v21, 6 ; GFX10_3-NEXT: v_readlane_b32 s38, v21, 5 ; GFX10_3-NEXT: v_readlane_b32 s37, v21, 4 @@ -1318,66 +966,38 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX11-NEXT: v_writelane_b32 v21, s37, 4 ; GFX11-NEXT: v_writelane_b32 v21, s38, 5 ; GFX11-NEXT: 
v_writelane_b32 v21, s39, 6 -; GFX11-NEXT: v_writelane_b32 v21, s40, 7 -; GFX11-NEXT: v_writelane_b32 v21, s41, 8 -; GFX11-NEXT: v_writelane_b32 v21, s42, 9 -; GFX11-NEXT: v_writelane_b32 v21, s43, 10 -; GFX11-NEXT: v_writelane_b32 v21, s44, 11 -; GFX11-NEXT: v_writelane_b32 v21, s45, 12 -; GFX11-NEXT: v_writelane_b32 v21, s46, 13 -; GFX11-NEXT: v_writelane_b32 v21, s47, 14 -; GFX11-NEXT: v_writelane_b32 v21, s48, 15 -; GFX11-NEXT: v_writelane_b32 v21, s49, 16 -; GFX11-NEXT: v_writelane_b32 v21, s50, 17 -; GFX11-NEXT: v_writelane_b32 v21, s51, 18 -; GFX11-NEXT: v_writelane_b32 v21, s52, 19 -; GFX11-NEXT: v_writelane_b32 v21, s53, 20 -; GFX11-NEXT: v_writelane_b32 v21, s54, 21 -; GFX11-NEXT: v_writelane_b32 v21, s55, 22 -; GFX11-NEXT: v_writelane_b32 v21, s56, 23 -; GFX11-NEXT: v_writelane_b32 v21, s57, 24 -; GFX11-NEXT: v_writelane_b32 v21, s58, 25 -; GFX11-NEXT: v_writelane_b32 v21, s59, 26 -; GFX11-NEXT: v_writelane_b32 v21, s30, 27 -; GFX11-NEXT: v_writelane_b32 v21, s31, 28 +; GFX11-NEXT: v_writelane_b32 v21, s48, 7 +; GFX11-NEXT: v_writelane_b32 v21, s49, 8 +; GFX11-NEXT: v_writelane_b32 v21, s50, 9 +; GFX11-NEXT: v_writelane_b32 v21, s51, 10 +; GFX11-NEXT: v_writelane_b32 v21, s52, 11 +; GFX11-NEXT: v_writelane_b32 v21, s53, 12 +; GFX11-NEXT: v_writelane_b32 v21, s54, 13 +; GFX11-NEXT: v_writelane_b32 v21, s55, 14 +; GFX11-NEXT: v_writelane_b32 v21, s30, 15 +; GFX11-NEXT: v_writelane_b32 v21, s31, 16 +; GFX11-NEXT: s_and_b32 s59, 0, exec_lo ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_and_b32 s59, 0, exec_lo -; GFX11-NEXT: s_addc_u32 s32, s32, 16 +; GFX11-NEXT: s_addc_u32 s60, s32, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_bitcmp1_b32 s32, 0 -; GFX11-NEXT: s_bitset0_b32 s32, 0 -; GFX11-NEXT: s_mov_b32 s59, s32 -; GFX11-NEXT: s_addc_u32 s32, s32, -16 -; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_bitcmp1_b32 s32, 0 -; GFX11-NEXT: s_bitset0_b32 s32, 0 +; GFX11-NEXT: s_bitcmp1_b32 s60, 0 +; GFX11-NEXT: s_bitset0_b32 s60, 0 +; GFX11-NEXT: s_mov_b32 s59, s60 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s30, v21, 27 -; GFX11-NEXT: v_readlane_b32 s31, v21, 28 -; GFX11-NEXT: v_readlane_b32 s59, v21, 26 -; GFX11-NEXT: v_readlane_b32 s58, v21, 25 -; GFX11-NEXT: v_readlane_b32 s57, v21, 24 -; GFX11-NEXT: v_readlane_b32 s56, v21, 23 -; GFX11-NEXT: v_readlane_b32 s55, v21, 22 -; GFX11-NEXT: v_readlane_b32 s54, v21, 21 -; GFX11-NEXT: v_readlane_b32 s53, v21, 20 -; GFX11-NEXT: v_readlane_b32 s52, v21, 19 -; GFX11-NEXT: v_readlane_b32 s51, v21, 18 -; GFX11-NEXT: v_readlane_b32 s50, v21, 17 -; GFX11-NEXT: v_readlane_b32 s49, v21, 16 -; GFX11-NEXT: v_readlane_b32 s48, v21, 15 -; GFX11-NEXT: v_readlane_b32 s47, v21, 14 -; GFX11-NEXT: v_readlane_b32 s46, v21, 13 -; GFX11-NEXT: v_readlane_b32 s45, v21, 12 -; GFX11-NEXT: v_readlane_b32 s44, v21, 11 -; GFX11-NEXT: v_readlane_b32 s43, v21, 10 -; GFX11-NEXT: v_readlane_b32 s42, v21, 9 -; GFX11-NEXT: v_readlane_b32 s41, v21, 8 -; GFX11-NEXT: v_readlane_b32 s40, v21, 7 +; GFX11-NEXT: v_readlane_b32 s30, v21, 15 +; GFX11-NEXT: v_readlane_b32 s31, v21, 16 +; GFX11-NEXT: v_readlane_b32 s55, v21, 14 +; GFX11-NEXT: v_readlane_b32 s54, v21, 13 +; GFX11-NEXT: v_readlane_b32 s53, v21, 12 +; GFX11-NEXT: v_readlane_b32 s52, v21, 11 +; GFX11-NEXT: v_readlane_b32 s51, v21, 10 +; GFX11-NEXT: v_readlane_b32 s50, v21, 9 +; GFX11-NEXT: v_readlane_b32 s49, v21, 8 +; GFX11-NEXT: v_readlane_b32 s48, v21, 7 ; GFX11-NEXT: v_readlane_b32 s39, v21, 6 ; GFX11-NEXT: v_readlane_b32 s38, v21, 5 ; GFX11-NEXT: v_readlane_b32 s37, v21, 4 @@ -1410,28 +1030,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX12-NEXT: v_writelane_b32 
v21, s37, 4 ; GFX12-NEXT: v_writelane_b32 v21, s38, 5 ; GFX12-NEXT: v_writelane_b32 v21, s39, 6 -; GFX12-NEXT: v_writelane_b32 v21, s40, 7 -; GFX12-NEXT: v_writelane_b32 v21, s41, 8 -; GFX12-NEXT: v_writelane_b32 v21, s42, 9 -; GFX12-NEXT: v_writelane_b32 v21, s43, 10 -; GFX12-NEXT: v_writelane_b32 v21, s44, 11 -; GFX12-NEXT: v_writelane_b32 v21, s45, 12 -; GFX12-NEXT: v_writelane_b32 v21, s46, 13 -; GFX12-NEXT: v_writelane_b32 v21, s47, 14 -; GFX12-NEXT: v_writelane_b32 v21, s48, 15 -; GFX12-NEXT: v_writelane_b32 v21, s49, 16 -; GFX12-NEXT: v_writelane_b32 v21, s50, 17 -; GFX12-NEXT: v_writelane_b32 v21, s51, 18 -; GFX12-NEXT: v_writelane_b32 v21, s52, 19 -; GFX12-NEXT: v_writelane_b32 v21, s53, 20 -; GFX12-NEXT: v_writelane_b32 v21, s54, 21 -; GFX12-NEXT: v_writelane_b32 v21, s55, 22 -; GFX12-NEXT: v_writelane_b32 v21, s56, 23 -; GFX12-NEXT: v_writelane_b32 v21, s57, 24 -; GFX12-NEXT: v_writelane_b32 v21, s58, 25 -; GFX12-NEXT: v_writelane_b32 v21, s59, 26 -; GFX12-NEXT: v_writelane_b32 v21, s30, 27 -; GFX12-NEXT: v_writelane_b32 v21, s31, 28 +; GFX12-NEXT: v_writelane_b32 v21, s48, 7 +; GFX12-NEXT: v_writelane_b32 v21, s49, 8 +; GFX12-NEXT: v_writelane_b32 v21, s50, 9 +; GFX12-NEXT: v_writelane_b32 v21, s51, 10 +; GFX12-NEXT: v_writelane_b32 v21, s52, 11 +; GFX12-NEXT: v_writelane_b32 v21, s53, 12 +; GFX12-NEXT: v_writelane_b32 v21, s54, 13 +; GFX12-NEXT: v_writelane_b32 v21, s55, 14 +; GFX12-NEXT: v_writelane_b32 v21, s30, 15 +; GFX12-NEXT: v_writelane_b32 v21, s31, 16 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX12-NEXT: ;;#ASMEND @@ -1440,28 +1048,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s30, v21, 27 -; GFX12-NEXT: v_readlane_b32 s31, v21, 28 -; 
GFX12-NEXT: v_readlane_b32 s59, v21, 26 -; GFX12-NEXT: v_readlane_b32 s58, v21, 25 -; GFX12-NEXT: v_readlane_b32 s57, v21, 24 -; GFX12-NEXT: v_readlane_b32 s56, v21, 23 -; GFX12-NEXT: v_readlane_b32 s55, v21, 22 -; GFX12-NEXT: v_readlane_b32 s54, v21, 21 -; GFX12-NEXT: v_readlane_b32 s53, v21, 20 -; GFX12-NEXT: v_readlane_b32 s52, v21, 19 -; GFX12-NEXT: v_readlane_b32 s51, v21, 18 -; GFX12-NEXT: v_readlane_b32 s50, v21, 17 -; GFX12-NEXT: v_readlane_b32 s49, v21, 16 -; GFX12-NEXT: v_readlane_b32 s48, v21, 15 -; GFX12-NEXT: v_readlane_b32 s47, v21, 14 -; GFX12-NEXT: v_readlane_b32 s46, v21, 13 -; GFX12-NEXT: v_readlane_b32 s45, v21, 12 -; GFX12-NEXT: v_readlane_b32 s44, v21, 11 -; GFX12-NEXT: v_readlane_b32 s43, v21, 10 -; GFX12-NEXT: v_readlane_b32 s42, v21, 9 -; GFX12-NEXT: v_readlane_b32 s41, v21, 8 -; GFX12-NEXT: v_readlane_b32 s40, v21, 7 +; GFX12-NEXT: v_readlane_b32 s30, v21, 15 +; GFX12-NEXT: v_readlane_b32 s31, v21, 16 +; GFX12-NEXT: v_readlane_b32 s55, v21, 14 +; GFX12-NEXT: v_readlane_b32 s54, v21, 13 +; GFX12-NEXT: v_readlane_b32 s53, v21, 12 +; GFX12-NEXT: v_readlane_b32 s52, v21, 11 +; GFX12-NEXT: v_readlane_b32 s51, v21, 10 +; GFX12-NEXT: v_readlane_b32 s50, v21, 9 +; GFX12-NEXT: v_readlane_b32 s49, v21, 8 +; GFX12-NEXT: v_readlane_b32 s48, v21, 7 ; GFX12-NEXT: v_readlane_b32 s39, v21, 6 ; GFX12-NEXT: v_readlane_b32 s38, v21, 5 ; GFX12-NEXT: v_readlane_b32 s37, v21, 4 @@ -1474,7 +1070,6 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 16, addrspace(5) @@ -1525,8 +1120,8 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: s_add_i32 s6, s32, 0x201100 ; GFX7-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: 
v_writelane_b32 v23, s28, 28 -; GFX7-NEXT: v_writelane_b32 v23, s29, 29 +; GFX7-NEXT: v_writelane_b32 v23, s28, 17 +; GFX7-NEXT: v_writelane_b32 v23, s29, 18 ; GFX7-NEXT: v_writelane_b32 v23, s33, 0 ; GFX7-NEXT: v_writelane_b32 v23, s34, 1 ; GFX7-NEXT: v_writelane_b32 v23, s35, 2 @@ -1534,27 +1129,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: v_writelane_b32 v23, s37, 4 ; GFX7-NEXT: v_writelane_b32 v23, s38, 5 ; GFX7-NEXT: v_writelane_b32 v23, s39, 6 -; GFX7-NEXT: v_writelane_b32 v23, s40, 7 -; GFX7-NEXT: v_writelane_b32 v23, s41, 8 -; GFX7-NEXT: v_writelane_b32 v23, s42, 9 -; GFX7-NEXT: v_writelane_b32 v23, s43, 10 -; GFX7-NEXT: v_writelane_b32 v23, s44, 11 -; GFX7-NEXT: v_writelane_b32 v23, s45, 12 -; GFX7-NEXT: v_writelane_b32 v23, s46, 13 -; GFX7-NEXT: v_writelane_b32 v23, s47, 14 -; GFX7-NEXT: v_writelane_b32 v23, s48, 15 -; GFX7-NEXT: v_writelane_b32 v23, s49, 16 -; GFX7-NEXT: v_writelane_b32 v23, s50, 17 -; GFX7-NEXT: v_writelane_b32 v23, s51, 18 -; GFX7-NEXT: v_writelane_b32 v23, s52, 19 -; GFX7-NEXT: v_writelane_b32 v23, s53, 20 -; GFX7-NEXT: v_writelane_b32 v23, s54, 21 -; GFX7-NEXT: v_writelane_b32 v23, s55, 22 -; GFX7-NEXT: v_writelane_b32 v23, s56, 23 -; GFX7-NEXT: v_writelane_b32 v23, s57, 24 -; GFX7-NEXT: v_writelane_b32 v23, s59, 25 -; GFX7-NEXT: v_writelane_b32 v23, s30, 26 -; GFX7-NEXT: v_writelane_b32 v23, s31, 27 +; GFX7-NEXT: v_writelane_b32 v23, s48, 7 +; GFX7-NEXT: v_writelane_b32 v23, s49, 8 +; GFX7-NEXT: v_writelane_b32 v23, s50, 9 +; GFX7-NEXT: v_writelane_b32 v23, s51, 10 +; GFX7-NEXT: v_writelane_b32 v23, s52, 11 +; GFX7-NEXT: v_writelane_b32 v23, s53, 12 +; GFX7-NEXT: v_writelane_b32 v23, s54, 13 +; GFX7-NEXT: v_writelane_b32 v23, s55, 14 +; GFX7-NEXT: v_writelane_b32 v23, s30, 15 +; GFX7-NEXT: v_writelane_b32 v23, s31, 16 ; GFX7-NEXT: s_lshr_b32 s5, s32, 6 ; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6 ; GFX7-NEXT: s_add_i32 s4, s5, 0x4240 @@ -1572,27 +1156,16 @@ define void 
@scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX7-NEXT: ;;#ASMEND -; GFX7-NEXT: v_readlane_b32 s30, v23, 26 -; GFX7-NEXT: v_readlane_b32 s31, v23, 27 -; GFX7-NEXT: v_readlane_b32 s59, v23, 25 -; GFX7-NEXT: v_readlane_b32 s57, v23, 24 -; GFX7-NEXT: v_readlane_b32 s56, v23, 23 -; GFX7-NEXT: v_readlane_b32 s55, v23, 22 -; GFX7-NEXT: v_readlane_b32 s54, v23, 21 -; GFX7-NEXT: v_readlane_b32 s53, v23, 20 -; GFX7-NEXT: v_readlane_b32 s52, v23, 19 -; GFX7-NEXT: v_readlane_b32 s51, v23, 18 -; GFX7-NEXT: v_readlane_b32 s50, v23, 17 -; GFX7-NEXT: v_readlane_b32 s49, v23, 16 -; GFX7-NEXT: v_readlane_b32 s48, v23, 15 -; GFX7-NEXT: v_readlane_b32 s47, v23, 14 -; GFX7-NEXT: v_readlane_b32 s46, v23, 13 -; GFX7-NEXT: v_readlane_b32 s45, v23, 12 -; GFX7-NEXT: v_readlane_b32 s44, v23, 11 -; GFX7-NEXT: v_readlane_b32 s43, v23, 10 -; GFX7-NEXT: v_readlane_b32 s42, v23, 9 -; GFX7-NEXT: v_readlane_b32 s41, v23, 8 -; GFX7-NEXT: v_readlane_b32 s40, v23, 7 +; GFX7-NEXT: v_readlane_b32 s30, v23, 15 +; GFX7-NEXT: v_readlane_b32 s31, v23, 16 +; GFX7-NEXT: v_readlane_b32 s55, v23, 14 +; GFX7-NEXT: v_readlane_b32 s54, v23, 13 +; GFX7-NEXT: v_readlane_b32 s53, v23, 12 +; GFX7-NEXT: v_readlane_b32 s52, v23, 11 +; GFX7-NEXT: v_readlane_b32 s51, v23, 10 +; GFX7-NEXT: v_readlane_b32 s50, v23, 9 +; GFX7-NEXT: v_readlane_b32 s49, v23, 8 +; GFX7-NEXT: v_readlane_b32 s48, v23, 7 ; GFX7-NEXT: v_readlane_b32 s39, v23, 6 ; GFX7-NEXT: v_readlane_b32 s38, v23, 5 ; GFX7-NEXT: v_readlane_b32 s37, v23, 4 @@ -1600,8 +1173,8 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: v_readlane_b32 s35, v23, 2 ; GFX7-NEXT: v_readlane_b32 s34, v23, 1 ; GFX7-NEXT: v_readlane_b32 s33, v23, 0 -; GFX7-NEXT: v_readlane_b32 s28, v23, 28 -; GFX7-NEXT: v_readlane_b32 s29, v23, 29 +; GFX7-NEXT: v_readlane_b32 s28, v23, 17 +; GFX7-NEXT: 
v_readlane_b32 s29, v23, 18 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: s_add_i32 s6, s32, 0x201000 ; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload @@ -1625,27 +1198,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX8-NEXT: v_writelane_b32 v22, s37, 4 ; GFX8-NEXT: v_writelane_b32 v22, s38, 5 ; GFX8-NEXT: v_writelane_b32 v22, s39, 6 -; GFX8-NEXT: v_writelane_b32 v22, s40, 7 -; GFX8-NEXT: v_writelane_b32 v22, s41, 8 -; GFX8-NEXT: v_writelane_b32 v22, s42, 9 -; GFX8-NEXT: v_writelane_b32 v22, s43, 10 -; GFX8-NEXT: v_writelane_b32 v22, s44, 11 -; GFX8-NEXT: v_writelane_b32 v22, s45, 12 -; GFX8-NEXT: v_writelane_b32 v22, s46, 13 -; GFX8-NEXT: v_writelane_b32 v22, s47, 14 -; GFX8-NEXT: v_writelane_b32 v22, s48, 15 -; GFX8-NEXT: v_writelane_b32 v22, s49, 16 -; GFX8-NEXT: v_writelane_b32 v22, s50, 17 -; GFX8-NEXT: v_writelane_b32 v22, s51, 18 -; GFX8-NEXT: v_writelane_b32 v22, s52, 19 -; GFX8-NEXT: v_writelane_b32 v22, s53, 20 -; GFX8-NEXT: v_writelane_b32 v22, s54, 21 -; GFX8-NEXT: v_writelane_b32 v22, s55, 22 -; GFX8-NEXT: v_writelane_b32 v22, s56, 23 -; GFX8-NEXT: v_writelane_b32 v22, s57, 24 -; GFX8-NEXT: v_writelane_b32 v22, s59, 25 -; GFX8-NEXT: v_writelane_b32 v22, s30, 26 -; GFX8-NEXT: v_writelane_b32 v22, s31, 27 +; GFX8-NEXT: v_writelane_b32 v22, s48, 7 +; GFX8-NEXT: v_writelane_b32 v22, s49, 8 +; GFX8-NEXT: v_writelane_b32 v22, s50, 9 +; GFX8-NEXT: v_writelane_b32 v22, s51, 10 +; GFX8-NEXT: v_writelane_b32 v22, s52, 11 +; GFX8-NEXT: v_writelane_b32 v22, s53, 12 +; GFX8-NEXT: v_writelane_b32 v22, s54, 13 +; GFX8-NEXT: v_writelane_b32 v22, s55, 14 +; GFX8-NEXT: v_writelane_b32 v22, s30, 15 +; GFX8-NEXT: v_writelane_b32 v22, s31, 16 ; GFX8-NEXT: s_lshr_b32 s4, s32, 6 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: s_add_i32 s59, s4, 0x4240 @@ -1660,27 +1222,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX8-NEXT: ;;#ASMSTART ; 
GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s30, v22, 26 -; GFX8-NEXT: v_readlane_b32 s31, v22, 27 -; GFX8-NEXT: v_readlane_b32 s59, v22, 25 -; GFX8-NEXT: v_readlane_b32 s57, v22, 24 -; GFX8-NEXT: v_readlane_b32 s56, v22, 23 -; GFX8-NEXT: v_readlane_b32 s55, v22, 22 -; GFX8-NEXT: v_readlane_b32 s54, v22, 21 -; GFX8-NEXT: v_readlane_b32 s53, v22, 20 -; GFX8-NEXT: v_readlane_b32 s52, v22, 19 -; GFX8-NEXT: v_readlane_b32 s51, v22, 18 -; GFX8-NEXT: v_readlane_b32 s50, v22, 17 -; GFX8-NEXT: v_readlane_b32 s49, v22, 16 -; GFX8-NEXT: v_readlane_b32 s48, v22, 15 -; GFX8-NEXT: v_readlane_b32 s47, v22, 14 -; GFX8-NEXT: v_readlane_b32 s46, v22, 13 -; GFX8-NEXT: v_readlane_b32 s45, v22, 12 -; GFX8-NEXT: v_readlane_b32 s44, v22, 11 -; GFX8-NEXT: v_readlane_b32 s43, v22, 10 -; GFX8-NEXT: v_readlane_b32 s42, v22, 9 -; GFX8-NEXT: v_readlane_b32 s41, v22, 8 -; GFX8-NEXT: v_readlane_b32 s40, v22, 7 +; GFX8-NEXT: v_readlane_b32 s30, v22, 15 +; GFX8-NEXT: v_readlane_b32 s31, v22, 16 +; GFX8-NEXT: v_readlane_b32 s55, v22, 14 +; GFX8-NEXT: v_readlane_b32 s54, v22, 13 +; GFX8-NEXT: v_readlane_b32 s53, v22, 12 +; GFX8-NEXT: v_readlane_b32 s52, v22, 11 +; GFX8-NEXT: v_readlane_b32 s51, v22, 10 +; GFX8-NEXT: v_readlane_b32 s50, v22, 9 +; GFX8-NEXT: v_readlane_b32 s49, v22, 8 +; GFX8-NEXT: v_readlane_b32 s48, v22, 7 ; GFX8-NEXT: v_readlane_b32 s39, v22, 6 ; GFX8-NEXT: v_readlane_b32 s38, v22, 5 ; GFX8-NEXT: v_readlane_b32 s37, v22, 4 @@ -1709,27 +1260,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX900-NEXT: v_writelane_b32 v22, s37, 4 ; GFX900-NEXT: v_writelane_b32 v22, s38, 5 ; GFX900-NEXT: v_writelane_b32 v22, s39, 6 -; GFX900-NEXT: v_writelane_b32 v22, s40, 7 -; GFX900-NEXT: v_writelane_b32 v22, s41, 8 -; GFX900-NEXT: v_writelane_b32 v22, s42, 9 -; GFX900-NEXT: v_writelane_b32 v22, s43, 10 -; GFX900-NEXT: v_writelane_b32 v22, s44, 11 -; 
GFX900-NEXT: v_writelane_b32 v22, s45, 12 -; GFX900-NEXT: v_writelane_b32 v22, s46, 13 -; GFX900-NEXT: v_writelane_b32 v22, s47, 14 -; GFX900-NEXT: v_writelane_b32 v22, s48, 15 -; GFX900-NEXT: v_writelane_b32 v22, s49, 16 -; GFX900-NEXT: v_writelane_b32 v22, s50, 17 -; GFX900-NEXT: v_writelane_b32 v22, s51, 18 -; GFX900-NEXT: v_writelane_b32 v22, s52, 19 -; GFX900-NEXT: v_writelane_b32 v22, s53, 20 -; GFX900-NEXT: v_writelane_b32 v22, s54, 21 -; GFX900-NEXT: v_writelane_b32 v22, s55, 22 -; GFX900-NEXT: v_writelane_b32 v22, s56, 23 -; GFX900-NEXT: v_writelane_b32 v22, s57, 24 -; GFX900-NEXT: v_writelane_b32 v22, s59, 25 -; GFX900-NEXT: v_writelane_b32 v22, s30, 26 -; GFX900-NEXT: v_writelane_b32 v22, s31, 27 +; GFX900-NEXT: v_writelane_b32 v22, s48, 7 +; GFX900-NEXT: v_writelane_b32 v22, s49, 8 +; GFX900-NEXT: v_writelane_b32 v22, s50, 9 +; GFX900-NEXT: v_writelane_b32 v22, s51, 10 +; GFX900-NEXT: v_writelane_b32 v22, s52, 11 +; GFX900-NEXT: v_writelane_b32 v22, s53, 12 +; GFX900-NEXT: v_writelane_b32 v22, s54, 13 +; GFX900-NEXT: v_writelane_b32 v22, s55, 14 +; GFX900-NEXT: v_writelane_b32 v22, s30, 15 +; GFX900-NEXT: v_writelane_b32 v22, s31, 16 ; GFX900-NEXT: s_lshr_b32 s4, s32, 6 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: s_add_i32 s59, s4, 0x4240 @@ -1744,27 +1284,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s30, v22, 26 -; GFX900-NEXT: v_readlane_b32 s31, v22, 27 -; GFX900-NEXT: v_readlane_b32 s59, v22, 25 -; GFX900-NEXT: v_readlane_b32 s57, v22, 24 -; GFX900-NEXT: v_readlane_b32 s56, v22, 23 -; GFX900-NEXT: v_readlane_b32 s55, v22, 22 -; GFX900-NEXT: v_readlane_b32 s54, v22, 21 -; GFX900-NEXT: v_readlane_b32 s53, v22, 20 -; GFX900-NEXT: v_readlane_b32 s52, v22, 19 -; GFX900-NEXT: v_readlane_b32 s51, v22, 18 -; 
GFX900-NEXT: v_readlane_b32 s50, v22, 17 -; GFX900-NEXT: v_readlane_b32 s49, v22, 16 -; GFX900-NEXT: v_readlane_b32 s48, v22, 15 -; GFX900-NEXT: v_readlane_b32 s47, v22, 14 -; GFX900-NEXT: v_readlane_b32 s46, v22, 13 -; GFX900-NEXT: v_readlane_b32 s45, v22, 12 -; GFX900-NEXT: v_readlane_b32 s44, v22, 11 -; GFX900-NEXT: v_readlane_b32 s43, v22, 10 -; GFX900-NEXT: v_readlane_b32 s42, v22, 9 -; GFX900-NEXT: v_readlane_b32 s41, v22, 8 -; GFX900-NEXT: v_readlane_b32 s40, v22, 7 +; GFX900-NEXT: v_readlane_b32 s30, v22, 15 +; GFX900-NEXT: v_readlane_b32 s31, v22, 16 +; GFX900-NEXT: v_readlane_b32 s55, v22, 14 +; GFX900-NEXT: v_readlane_b32 s54, v22, 13 +; GFX900-NEXT: v_readlane_b32 s53, v22, 12 +; GFX900-NEXT: v_readlane_b32 s52, v22, 11 +; GFX900-NEXT: v_readlane_b32 s51, v22, 10 +; GFX900-NEXT: v_readlane_b32 s50, v22, 9 +; GFX900-NEXT: v_readlane_b32 s49, v22, 8 +; GFX900-NEXT: v_readlane_b32 s48, v22, 7 ; GFX900-NEXT: v_readlane_b32 s39, v22, 6 ; GFX900-NEXT: v_readlane_b32 s38, v22, 5 ; GFX900-NEXT: v_readlane_b32 s37, v22, 4 @@ -1779,93 +1308,67 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX940-NEXT: scratch_store_dword off, v22, s2 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v22, s33, 0 -; GFX940-NEXT: v_writelane_b32 v22, s34, 1 -; GFX940-NEXT: v_writelane_b32 v22, s35, 2 -; GFX940-NEXT: v_writelane_b32 v22, s36, 3 -; GFX940-NEXT: v_writelane_b32 v22, s37, 4 -; GFX940-NEXT: v_writelane_b32 v22, s38, 5 -; GFX940-NEXT: v_writelane_b32 v22, s39, 6 -; GFX940-NEXT: v_writelane_b32 v22, s40, 7 -; GFX940-NEXT: v_writelane_b32 v22, s41, 8 -; 
GFX940-NEXT: v_writelane_b32 v22, s42, 9 -; GFX940-NEXT: v_writelane_b32 v22, s43, 10 -; GFX940-NEXT: v_writelane_b32 v22, s44, 11 -; GFX940-NEXT: v_writelane_b32 v22, s45, 12 -; GFX940-NEXT: v_writelane_b32 v22, s46, 13 -; GFX940-NEXT: v_writelane_b32 v22, s47, 14 -; GFX940-NEXT: v_writelane_b32 v22, s48, 15 -; GFX940-NEXT: v_writelane_b32 v22, s49, 16 -; GFX940-NEXT: v_writelane_b32 v22, s50, 17 -; GFX940-NEXT: v_writelane_b32 v22, s51, 18 -; GFX940-NEXT: v_writelane_b32 v22, s52, 19 -; GFX940-NEXT: v_writelane_b32 v22, s53, 20 -; GFX940-NEXT: v_writelane_b32 v22, s54, 21 -; GFX940-NEXT: v_writelane_b32 v22, s55, 22 -; GFX940-NEXT: v_writelane_b32 v22, s56, 23 -; GFX940-NEXT: v_writelane_b32 v22, s57, 24 -; GFX940-NEXT: v_writelane_b32 v22, s59, 25 -; GFX940-NEXT: v_writelane_b32 v22, s60, 26 -; GFX940-NEXT: v_writelane_b32 v22, s61, 27 -; GFX940-NEXT: v_writelane_b32 v22, s30, 28 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v22, s31, 29 -; GFX940-NEXT: s_add_i32 s0, s32, 64 -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use alloca0 v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_add_i32 s59, s32, 0x4240 -; GFX940-NEXT: s_and_b64 s[60:61], 0, exec -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s30, v22, 28 -; GFX940-NEXT: v_readlane_b32 s31, v22, 29 -; GFX940-NEXT: v_readlane_b32 s61, v22, 27 -; GFX940-NEXT: v_readlane_b32 s60, v22, 26 -; GFX940-NEXT: v_readlane_b32 s59, v22, 25 -; GFX940-NEXT: v_readlane_b32 s57, v22, 24 -; GFX940-NEXT: v_readlane_b32 s56, v22, 23 -; GFX940-NEXT: v_readlane_b32 s55, v22, 22 -; GFX940-NEXT: v_readlane_b32 s54, v22, 21 -; GFX940-NEXT: v_readlane_b32 s53, v22, 20 -; GFX940-NEXT: v_readlane_b32 s52, 
v22, 19 -; GFX940-NEXT: v_readlane_b32 s51, v22, 18 -; GFX940-NEXT: v_readlane_b32 s50, v22, 17 -; GFX940-NEXT: v_readlane_b32 s49, v22, 16 -; GFX940-NEXT: v_readlane_b32 s48, v22, 15 -; GFX940-NEXT: v_readlane_b32 s47, v22, 14 -; GFX940-NEXT: v_readlane_b32 s46, v22, 13 -; GFX940-NEXT: v_readlane_b32 s45, v22, 12 -; GFX940-NEXT: v_readlane_b32 s44, v22, 11 -; GFX940-NEXT: v_readlane_b32 s43, v22, 10 -; GFX940-NEXT: v_readlane_b32 s42, v22, 9 -; GFX940-NEXT: v_readlane_b32 s41, v22, 8 -; GFX940-NEXT: v_readlane_b32 s40, v22, 7 -; GFX940-NEXT: v_readlane_b32 s39, v22, 6 -; GFX940-NEXT: v_readlane_b32 s38, v22, 5 -; GFX940-NEXT: v_readlane_b32 s37, v22, 4 -; GFX940-NEXT: v_readlane_b32 s36, v22, 3 -; GFX940-NEXT: v_readlane_b32 s35, v22, 2 -; GFX940-NEXT: v_readlane_b32 s34, v22, 1 -; GFX940-NEXT: v_readlane_b32 s33, v22, 0 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX940-NEXT: scratch_load_dword v22, off, s2 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: s_add_i32 s2, s32, 0x8040 +; GFX942-NEXT: scratch_store_dword off, v22, s2 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v22, s33, 0 +; GFX942-NEXT: v_writelane_b32 v22, s34, 1 +; GFX942-NEXT: v_writelane_b32 v22, s35, 2 +; GFX942-NEXT: v_writelane_b32 v22, s36, 3 +; GFX942-NEXT: v_writelane_b32 v22, s37, 4 +; GFX942-NEXT: v_writelane_b32 v22, s38, 5 +; GFX942-NEXT: v_writelane_b32 v22, s39, 6 +; GFX942-NEXT: v_writelane_b32 v22, s48, 7 +; GFX942-NEXT: v_writelane_b32 v22, s49, 8 +; GFX942-NEXT: v_writelane_b32 v22, s50, 9 +; GFX942-NEXT: v_writelane_b32 v22, s51, 10 +; GFX942-NEXT: v_writelane_b32 v22, s52, 
11 +; GFX942-NEXT: v_writelane_b32 v22, s53, 12 +; GFX942-NEXT: v_writelane_b32 v22, s54, 13 +; GFX942-NEXT: v_writelane_b32 v22, s55, 14 +; GFX942-NEXT: v_writelane_b32 v22, s30, 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v22, s31, 16 +; GFX942-NEXT: s_add_i32 s0, s32, 64 +; GFX942-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use alloca0 v0 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_add_i32 s59, s32, 0x4240 +; GFX942-NEXT: s_and_b64 s[60:61], 0, exec +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s30, v22, 15 +; GFX942-NEXT: v_readlane_b32 s31, v22, 16 +; GFX942-NEXT: v_readlane_b32 s55, v22, 14 +; GFX942-NEXT: v_readlane_b32 s54, v22, 13 +; GFX942-NEXT: v_readlane_b32 s53, v22, 12 +; GFX942-NEXT: v_readlane_b32 s52, v22, 11 +; GFX942-NEXT: v_readlane_b32 s51, v22, 10 +; GFX942-NEXT: v_readlane_b32 s50, v22, 9 +; GFX942-NEXT: v_readlane_b32 s49, v22, 8 +; GFX942-NEXT: v_readlane_b32 s48, v22, 7 +; GFX942-NEXT: v_readlane_b32 s39, v22, 6 +; GFX942-NEXT: v_readlane_b32 s38, v22, 5 +; GFX942-NEXT: v_readlane_b32 s37, v22, 4 +; GFX942-NEXT: v_readlane_b32 s36, v22, 3 +; GFX942-NEXT: v_readlane_b32 s35, v22, 2 +; GFX942-NEXT: v_readlane_b32 s34, v22, 1 +; GFX942-NEXT: v_readlane_b32 s33, v22, 0 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: s_add_i32 s2, s32, 0x8040 +; GFX942-NEXT: scratch_load_dword v22, off, s2 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10_1-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset: ; GFX10_1: ; %bb.0: @@ -1882,27 +1385,16 @@ define void 
@scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_1-NEXT: v_writelane_b32 v22, s37, 4 ; GFX10_1-NEXT: v_writelane_b32 v22, s38, 5 ; GFX10_1-NEXT: v_writelane_b32 v22, s39, 6 -; GFX10_1-NEXT: v_writelane_b32 v22, s40, 7 -; GFX10_1-NEXT: v_writelane_b32 v22, s41, 8 -; GFX10_1-NEXT: v_writelane_b32 v22, s42, 9 -; GFX10_1-NEXT: v_writelane_b32 v22, s43, 10 -; GFX10_1-NEXT: v_writelane_b32 v22, s44, 11 -; GFX10_1-NEXT: v_writelane_b32 v22, s45, 12 -; GFX10_1-NEXT: v_writelane_b32 v22, s46, 13 -; GFX10_1-NEXT: v_writelane_b32 v22, s47, 14 -; GFX10_1-NEXT: v_writelane_b32 v22, s48, 15 -; GFX10_1-NEXT: v_writelane_b32 v22, s49, 16 -; GFX10_1-NEXT: v_writelane_b32 v22, s50, 17 -; GFX10_1-NEXT: v_writelane_b32 v22, s51, 18 -; GFX10_1-NEXT: v_writelane_b32 v22, s52, 19 -; GFX10_1-NEXT: v_writelane_b32 v22, s53, 20 -; GFX10_1-NEXT: v_writelane_b32 v22, s54, 21 -; GFX10_1-NEXT: v_writelane_b32 v22, s55, 22 -; GFX10_1-NEXT: v_writelane_b32 v22, s56, 23 -; GFX10_1-NEXT: v_writelane_b32 v22, s57, 24 -; GFX10_1-NEXT: v_writelane_b32 v22, s59, 25 -; GFX10_1-NEXT: v_writelane_b32 v22, s30, 26 -; GFX10_1-NEXT: v_writelane_b32 v22, s31, 27 +; GFX10_1-NEXT: v_writelane_b32 v22, s48, 7 +; GFX10_1-NEXT: v_writelane_b32 v22, s49, 8 +; GFX10_1-NEXT: v_writelane_b32 v22, s50, 9 +; GFX10_1-NEXT: v_writelane_b32 v22, s51, 10 +; GFX10_1-NEXT: v_writelane_b32 v22, s52, 11 +; GFX10_1-NEXT: v_writelane_b32 v22, s53, 12 +; GFX10_1-NEXT: v_writelane_b32 v22, s54, 13 +; GFX10_1-NEXT: v_writelane_b32 v22, s55, 14 +; GFX10_1-NEXT: v_writelane_b32 v22, s30, 15 +; GFX10_1-NEXT: v_writelane_b32 v22, s31, 16 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5 ; GFX10_1-NEXT: s_add_i32 s59, s4, 0x4240 @@ -1917,27 +1409,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX10_1-NEXT: 
;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s30, v22, 26 -; GFX10_1-NEXT: v_readlane_b32 s31, v22, 27 -; GFX10_1-NEXT: v_readlane_b32 s59, v22, 25 -; GFX10_1-NEXT: v_readlane_b32 s57, v22, 24 -; GFX10_1-NEXT: v_readlane_b32 s56, v22, 23 -; GFX10_1-NEXT: v_readlane_b32 s55, v22, 22 -; GFX10_1-NEXT: v_readlane_b32 s54, v22, 21 -; GFX10_1-NEXT: v_readlane_b32 s53, v22, 20 -; GFX10_1-NEXT: v_readlane_b32 s52, v22, 19 -; GFX10_1-NEXT: v_readlane_b32 s51, v22, 18 -; GFX10_1-NEXT: v_readlane_b32 s50, v22, 17 -; GFX10_1-NEXT: v_readlane_b32 s49, v22, 16 -; GFX10_1-NEXT: v_readlane_b32 s48, v22, 15 -; GFX10_1-NEXT: v_readlane_b32 s47, v22, 14 -; GFX10_1-NEXT: v_readlane_b32 s46, v22, 13 -; GFX10_1-NEXT: v_readlane_b32 s45, v22, 12 -; GFX10_1-NEXT: v_readlane_b32 s44, v22, 11 -; GFX10_1-NEXT: v_readlane_b32 s43, v22, 10 -; GFX10_1-NEXT: v_readlane_b32 s42, v22, 9 -; GFX10_1-NEXT: v_readlane_b32 s41, v22, 8 -; GFX10_1-NEXT: v_readlane_b32 s40, v22, 7 +; GFX10_1-NEXT: v_readlane_b32 s30, v22, 15 +; GFX10_1-NEXT: v_readlane_b32 s31, v22, 16 +; GFX10_1-NEXT: v_readlane_b32 s55, v22, 14 +; GFX10_1-NEXT: v_readlane_b32 s54, v22, 13 +; GFX10_1-NEXT: v_readlane_b32 s53, v22, 12 +; GFX10_1-NEXT: v_readlane_b32 s52, v22, 11 +; GFX10_1-NEXT: v_readlane_b32 s51, v22, 10 +; GFX10_1-NEXT: v_readlane_b32 s50, v22, 9 +; GFX10_1-NEXT: v_readlane_b32 s49, v22, 8 +; GFX10_1-NEXT: v_readlane_b32 s48, v22, 7 ; GFX10_1-NEXT: v_readlane_b32 s39, v22, 6 ; GFX10_1-NEXT: v_readlane_b32 s38, v22, 5 ; GFX10_1-NEXT: v_readlane_b32 s37, v22, 4 @@ -1967,27 +1448,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_3-NEXT: v_writelane_b32 v22, s37, 4 ; GFX10_3-NEXT: v_writelane_b32 v22, s38, 5 ; GFX10_3-NEXT: v_writelane_b32 v22, s39, 6 -; GFX10_3-NEXT: v_writelane_b32 v22, s40, 7 -; GFX10_3-NEXT: v_writelane_b32 v22, s41, 8 -; GFX10_3-NEXT: v_writelane_b32 v22, s42, 9 -; GFX10_3-NEXT: v_writelane_b32 v22, s43, 10 -; GFX10_3-NEXT: v_writelane_b32 v22, s44, 11 -; 
GFX10_3-NEXT: v_writelane_b32 v22, s45, 12 -; GFX10_3-NEXT: v_writelane_b32 v22, s46, 13 -; GFX10_3-NEXT: v_writelane_b32 v22, s47, 14 -; GFX10_3-NEXT: v_writelane_b32 v22, s48, 15 -; GFX10_3-NEXT: v_writelane_b32 v22, s49, 16 -; GFX10_3-NEXT: v_writelane_b32 v22, s50, 17 -; GFX10_3-NEXT: v_writelane_b32 v22, s51, 18 -; GFX10_3-NEXT: v_writelane_b32 v22, s52, 19 -; GFX10_3-NEXT: v_writelane_b32 v22, s53, 20 -; GFX10_3-NEXT: v_writelane_b32 v22, s54, 21 -; GFX10_3-NEXT: v_writelane_b32 v22, s55, 22 -; GFX10_3-NEXT: v_writelane_b32 v22, s56, 23 -; GFX10_3-NEXT: v_writelane_b32 v22, s57, 24 -; GFX10_3-NEXT: v_writelane_b32 v22, s59, 25 -; GFX10_3-NEXT: v_writelane_b32 v22, s30, 26 -; GFX10_3-NEXT: v_writelane_b32 v22, s31, 27 +; GFX10_3-NEXT: v_writelane_b32 v22, s48, 7 +; GFX10_3-NEXT: v_writelane_b32 v22, s49, 8 +; GFX10_3-NEXT: v_writelane_b32 v22, s50, 9 +; GFX10_3-NEXT: v_writelane_b32 v22, s51, 10 +; GFX10_3-NEXT: v_writelane_b32 v22, s52, 11 +; GFX10_3-NEXT: v_writelane_b32 v22, s53, 12 +; GFX10_3-NEXT: v_writelane_b32 v22, s54, 13 +; GFX10_3-NEXT: v_writelane_b32 v22, s55, 14 +; GFX10_3-NEXT: v_writelane_b32 v22, s30, 15 +; GFX10_3-NEXT: v_writelane_b32 v22, s31, 16 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5 ; GFX10_3-NEXT: s_add_i32 s59, s4, 0x4240 @@ -2002,27 +1472,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s30, v22, 26 -; GFX10_3-NEXT: v_readlane_b32 s31, v22, 27 -; GFX10_3-NEXT: v_readlane_b32 s59, v22, 25 -; GFX10_3-NEXT: v_readlane_b32 s57, v22, 24 -; GFX10_3-NEXT: v_readlane_b32 s56, v22, 23 -; GFX10_3-NEXT: v_readlane_b32 s55, v22, 22 -; GFX10_3-NEXT: v_readlane_b32 s54, v22, 21 -; GFX10_3-NEXT: v_readlane_b32 s53, v22, 20 -; GFX10_3-NEXT: v_readlane_b32 s52, v22, 19 -; GFX10_3-NEXT: 
v_readlane_b32 s51, v22, 18 -; GFX10_3-NEXT: v_readlane_b32 s50, v22, 17 -; GFX10_3-NEXT: v_readlane_b32 s49, v22, 16 -; GFX10_3-NEXT: v_readlane_b32 s48, v22, 15 -; GFX10_3-NEXT: v_readlane_b32 s47, v22, 14 -; GFX10_3-NEXT: v_readlane_b32 s46, v22, 13 -; GFX10_3-NEXT: v_readlane_b32 s45, v22, 12 -; GFX10_3-NEXT: v_readlane_b32 s44, v22, 11 -; GFX10_3-NEXT: v_readlane_b32 s43, v22, 10 -; GFX10_3-NEXT: v_readlane_b32 s42, v22, 9 -; GFX10_3-NEXT: v_readlane_b32 s41, v22, 8 -; GFX10_3-NEXT: v_readlane_b32 s40, v22, 7 +; GFX10_3-NEXT: v_readlane_b32 s30, v22, 15 +; GFX10_3-NEXT: v_readlane_b32 s31, v22, 16 +; GFX10_3-NEXT: v_readlane_b32 s55, v22, 14 +; GFX10_3-NEXT: v_readlane_b32 s54, v22, 13 +; GFX10_3-NEXT: v_readlane_b32 s53, v22, 12 +; GFX10_3-NEXT: v_readlane_b32 s52, v22, 11 +; GFX10_3-NEXT: v_readlane_b32 s51, v22, 10 +; GFX10_3-NEXT: v_readlane_b32 s50, v22, 9 +; GFX10_3-NEXT: v_readlane_b32 s49, v22, 8 +; GFX10_3-NEXT: v_readlane_b32 s48, v22, 7 ; GFX10_3-NEXT: v_readlane_b32 s39, v22, 6 ; GFX10_3-NEXT: v_readlane_b32 s38, v22, 5 ; GFX10_3-NEXT: v_readlane_b32 s37, v22, 4 @@ -2051,27 +1510,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX11-NEXT: v_writelane_b32 v22, s37, 4 ; GFX11-NEXT: v_writelane_b32 v22, s38, 5 ; GFX11-NEXT: v_writelane_b32 v22, s39, 6 -; GFX11-NEXT: v_writelane_b32 v22, s40, 7 -; GFX11-NEXT: v_writelane_b32 v22, s41, 8 -; GFX11-NEXT: v_writelane_b32 v22, s42, 9 -; GFX11-NEXT: v_writelane_b32 v22, s43, 10 -; GFX11-NEXT: v_writelane_b32 v22, s44, 11 -; GFX11-NEXT: v_writelane_b32 v22, s45, 12 -; GFX11-NEXT: v_writelane_b32 v22, s46, 13 -; GFX11-NEXT: v_writelane_b32 v22, s47, 14 -; GFX11-NEXT: v_writelane_b32 v22, s48, 15 -; GFX11-NEXT: v_writelane_b32 v22, s49, 16 -; GFX11-NEXT: v_writelane_b32 v22, s50, 17 -; GFX11-NEXT: v_writelane_b32 v22, s51, 18 -; GFX11-NEXT: v_writelane_b32 v22, s52, 19 -; GFX11-NEXT: v_writelane_b32 v22, s53, 20 -; GFX11-NEXT: v_writelane_b32 v22, s54, 21 -; GFX11-NEXT: 
v_writelane_b32 v22, s55, 22 -; GFX11-NEXT: v_writelane_b32 v22, s56, 23 -; GFX11-NEXT: v_writelane_b32 v22, s57, 24 -; GFX11-NEXT: v_writelane_b32 v22, s59, 25 -; GFX11-NEXT: v_writelane_b32 v22, s30, 26 -; GFX11-NEXT: v_writelane_b32 v22, s31, 27 +; GFX11-NEXT: v_writelane_b32 v22, s48, 7 +; GFX11-NEXT: v_writelane_b32 v22, s49, 8 +; GFX11-NEXT: v_writelane_b32 v22, s50, 9 +; GFX11-NEXT: v_writelane_b32 v22, s51, 10 +; GFX11-NEXT: v_writelane_b32 v22, s52, 11 +; GFX11-NEXT: v_writelane_b32 v22, s53, 12 +; GFX11-NEXT: v_writelane_b32 v22, s54, 13 +; GFX11-NEXT: v_writelane_b32 v22, s55, 14 +; GFX11-NEXT: v_writelane_b32 v22, s30, 15 +; GFX11-NEXT: v_writelane_b32 v22, s31, 16 ; GFX11-NEXT: s_add_i32 s0, s32, 64 ; GFX11-NEXT: s_add_i32 s59, s32, 0x4240 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -2085,27 +1533,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s30, v22, 26 -; GFX11-NEXT: v_readlane_b32 s31, v22, 27 -; GFX11-NEXT: v_readlane_b32 s59, v22, 25 -; GFX11-NEXT: v_readlane_b32 s57, v22, 24 -; GFX11-NEXT: v_readlane_b32 s56, v22, 23 -; GFX11-NEXT: v_readlane_b32 s55, v22, 22 -; GFX11-NEXT: v_readlane_b32 s54, v22, 21 -; GFX11-NEXT: v_readlane_b32 s53, v22, 20 -; GFX11-NEXT: v_readlane_b32 s52, v22, 19 -; GFX11-NEXT: v_readlane_b32 s51, v22, 18 -; GFX11-NEXT: v_readlane_b32 s50, v22, 17 -; GFX11-NEXT: v_readlane_b32 s49, v22, 16 -; GFX11-NEXT: v_readlane_b32 s48, v22, 15 -; GFX11-NEXT: v_readlane_b32 s47, v22, 14 -; GFX11-NEXT: v_readlane_b32 s46, v22, 13 -; GFX11-NEXT: v_readlane_b32 s45, v22, 12 -; GFX11-NEXT: v_readlane_b32 s44, v22, 11 -; GFX11-NEXT: v_readlane_b32 s43, v22, 10 -; GFX11-NEXT: v_readlane_b32 s42, v22, 9 -; GFX11-NEXT: v_readlane_b32 s41, v22, 8 -; GFX11-NEXT: v_readlane_b32 s40, v22, 7 +; GFX11-NEXT: v_readlane_b32 s30, v22, 15 
+; GFX11-NEXT: v_readlane_b32 s31, v22, 16 +; GFX11-NEXT: v_readlane_b32 s55, v22, 14 +; GFX11-NEXT: v_readlane_b32 s54, v22, 13 +; GFX11-NEXT: v_readlane_b32 s53, v22, 12 +; GFX11-NEXT: v_readlane_b32 s52, v22, 11 +; GFX11-NEXT: v_readlane_b32 s51, v22, 10 +; GFX11-NEXT: v_readlane_b32 s50, v22, 9 +; GFX11-NEXT: v_readlane_b32 s49, v22, 8 +; GFX11-NEXT: v_readlane_b32 s48, v22, 7 ; GFX11-NEXT: v_readlane_b32 s39, v22, 6 ; GFX11-NEXT: v_readlane_b32 s38, v22, 5 ; GFX11-NEXT: v_readlane_b32 s37, v22, 4 @@ -2138,27 +1575,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: v_writelane_b32 v22, s37, 4 ; GFX12-NEXT: v_writelane_b32 v22, s38, 5 ; GFX12-NEXT: v_writelane_b32 v22, s39, 6 -; GFX12-NEXT: v_writelane_b32 v22, s40, 7 -; GFX12-NEXT: v_writelane_b32 v22, s41, 8 -; GFX12-NEXT: v_writelane_b32 v22, s42, 9 -; GFX12-NEXT: v_writelane_b32 v22, s43, 10 -; GFX12-NEXT: v_writelane_b32 v22, s44, 11 -; GFX12-NEXT: v_writelane_b32 v22, s45, 12 -; GFX12-NEXT: v_writelane_b32 v22, s46, 13 -; GFX12-NEXT: v_writelane_b32 v22, s47, 14 -; GFX12-NEXT: v_writelane_b32 v22, s48, 15 -; GFX12-NEXT: v_writelane_b32 v22, s49, 16 -; GFX12-NEXT: v_writelane_b32 v22, s50, 17 -; GFX12-NEXT: v_writelane_b32 v22, s51, 18 -; GFX12-NEXT: v_writelane_b32 v22, s52, 19 -; GFX12-NEXT: v_writelane_b32 v22, s53, 20 -; GFX12-NEXT: v_writelane_b32 v22, s54, 21 -; GFX12-NEXT: v_writelane_b32 v22, s55, 22 -; GFX12-NEXT: v_writelane_b32 v22, s56, 23 -; GFX12-NEXT: v_writelane_b32 v22, s57, 24 -; GFX12-NEXT: v_writelane_b32 v22, s59, 25 -; GFX12-NEXT: v_writelane_b32 v22, s30, 26 -; GFX12-NEXT: v_writelane_b32 v22, s31, 27 +; GFX12-NEXT: v_writelane_b32 v22, s48, 7 +; GFX12-NEXT: v_writelane_b32 v22, s49, 8 +; GFX12-NEXT: v_writelane_b32 v22, s50, 9 +; GFX12-NEXT: v_writelane_b32 v22, s51, 10 +; GFX12-NEXT: v_writelane_b32 v22, s52, 11 +; GFX12-NEXT: v_writelane_b32 v22, s53, 12 +; GFX12-NEXT: v_writelane_b32 v22, s54, 13 +; GFX12-NEXT: v_writelane_b32 v22, 
s55, 14 +; GFX12-NEXT: v_writelane_b32 v22, s30, 15 +; GFX12-NEXT: v_writelane_b32 v22, s31, 16 ; GFX12-NEXT: s_add_co_i32 s59, s32, 0x4200 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo @@ -2171,27 +1597,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s30, v22, 26 -; GFX12-NEXT: v_readlane_b32 s31, v22, 27 -; GFX12-NEXT: v_readlane_b32 s59, v22, 25 -; GFX12-NEXT: v_readlane_b32 s57, v22, 24 -; GFX12-NEXT: v_readlane_b32 s56, v22, 23 -; GFX12-NEXT: v_readlane_b32 s55, v22, 22 -; GFX12-NEXT: v_readlane_b32 s54, v22, 21 -; GFX12-NEXT: v_readlane_b32 s53, v22, 20 -; GFX12-NEXT: v_readlane_b32 s52, v22, 19 -; GFX12-NEXT: v_readlane_b32 s51, v22, 18 -; GFX12-NEXT: v_readlane_b32 s50, v22, 17 -; GFX12-NEXT: v_readlane_b32 s49, v22, 16 -; GFX12-NEXT: v_readlane_b32 s48, v22, 15 -; GFX12-NEXT: v_readlane_b32 s47, v22, 14 -; GFX12-NEXT: v_readlane_b32 s46, v22, 13 -; GFX12-NEXT: v_readlane_b32 s45, v22, 12 -; GFX12-NEXT: v_readlane_b32 s44, v22, 11 -; GFX12-NEXT: v_readlane_b32 s43, v22, 10 -; GFX12-NEXT: v_readlane_b32 s42, v22, 9 -; GFX12-NEXT: v_readlane_b32 s41, v22, 8 -; GFX12-NEXT: v_readlane_b32 s40, v22, 7 +; GFX12-NEXT: v_readlane_b32 s30, v22, 15 +; GFX12-NEXT: v_readlane_b32 s31, v22, 16 +; GFX12-NEXT: v_readlane_b32 s55, v22, 14 +; GFX12-NEXT: v_readlane_b32 s54, v22, 13 +; GFX12-NEXT: v_readlane_b32 s53, v22, 12 +; GFX12-NEXT: v_readlane_b32 s52, v22, 11 +; GFX12-NEXT: v_readlane_b32 s51, v22, 10 +; GFX12-NEXT: v_readlane_b32 s50, v22, 9 +; GFX12-NEXT: v_readlane_b32 s49, v22, 8 +; GFX12-NEXT: v_readlane_b32 s48, v22, 7 ; GFX12-NEXT: v_readlane_b32 s39, v22, 6 ; GFX12-NEXT: v_readlane_b32 s38, v22, 5 ; GFX12-NEXT: v_readlane_b32 s37, v22, 4 @@ -2204,7 +1619,6 @@ define void 
@scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [4096 x i32], align 64, addrspace(5) %alloca1 = alloca [4096 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index fa15a42aef2ac..64afe3cd01255 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -278,17 +278,20 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -296,19 +299,22 @@ define bfloat @v_maximumnum_bf16(bfloat %x, 
bfloat %y) { ; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) @@ -391,15 +397,19 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_wait_alu 
0xfffd ; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll index 0112453e32bfc..52f380b7f80a3 100644 --- a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll +++ b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: I_Quit: ; CHECK: .set I_Quit.num_vgpr, max(41, amdgpu.max_num_vgpr) ; CHECK: .set I_Quit.num_agpr, max(0, amdgpu.max_num_agpr) -; CHECK: .set I_Quit.numbered_sgpr, max(48, amdgpu.max_num_sgpr) +; CHECK: .set I_Quit.numbered_sgpr, max(56, amdgpu.max_num_sgpr) ; CHECK: .set I_Quit.private_seg_size, 16 ; CHECK: .set I_Quit.uses_vcc, 1 ; CHECK: .set I_Quit.uses_flat_scratch, 1 @@ -80,7 +80,7 @@ define void @P_SetThingPosition() { ; CHECK-LABEL: P_SetupPsprites: ; CHECK: .set P_SetupPsprites.num_vgpr, max(41, amdgpu.max_num_vgpr) ; CHECK: .set P_SetupPsprites.num_agpr, max(0, amdgpu.max_num_agpr) -; CHECK: .set P_SetupPsprites.numbered_sgpr, max(48, amdgpu.max_num_sgpr) +; CHECK: .set P_SetupPsprites.numbered_sgpr, max(56, amdgpu.max_num_sgpr) ; CHECK: .set P_SetupPsprites.private_seg_size, 16 ; CHECK: .set P_SetupPsprites.uses_vcc, 1 ; CHECK: .set P_SetupPsprites.uses_flat_scratch, 1 @@ -110,7 +110,7 @@ define void @HU_Start() { ; CHECK-LABEL: P_SpawnPlayer: ; CHECK: .set P_SpawnPlayer.num_vgpr, max(43, G_PlayerReborn.num_vgpr, P_SetThingPosition.num_vgpr, P_SetupPsprites.num_vgpr, 
HU_Start.num_vgpr) ; CHECK: .set P_SpawnPlayer.num_agpr, max(0, G_PlayerReborn.num_agpr, P_SetThingPosition.num_agpr, P_SetupPsprites.num_agpr, HU_Start.num_agpr) -; CHECK: .set P_SpawnPlayer.numbered_sgpr, max(60, G_PlayerReborn.numbered_sgpr, P_SetThingPosition.numbered_sgpr, P_SetupPsprites.numbered_sgpr, HU_Start.numbered_sgpr) +; CHECK: .set P_SpawnPlayer.numbered_sgpr, max(84, G_PlayerReborn.numbered_sgpr, P_SetThingPosition.numbered_sgpr, P_SetupPsprites.numbered_sgpr, HU_Start.numbered_sgpr) ; CHECK: .set P_SpawnPlayer.private_seg_size, 16+(max(G_PlayerReborn.private_seg_size, P_SetThingPosition.private_seg_size, P_SetupPsprites.private_seg_size, HU_Start.private_seg_size)) ; CHECK: .set P_SpawnPlayer.uses_vcc, or(1, G_PlayerReborn.uses_vcc, P_SetThingPosition.uses_vcc, P_SetupPsprites.uses_vcc, HU_Start.uses_vcc) ; CHECK: .set P_SpawnPlayer.uses_flat_scratch, or(0, G_PlayerReborn.uses_flat_scratch, P_SetThingPosition.uses_flat_scratch, P_SetupPsprites.uses_flat_scratch, HU_Start.uses_flat_scratch) @@ -128,7 +128,7 @@ define void @P_SpawnPlayer() { ; CHECK-LABEL: I_Error: ; CHECK: .set I_Error.num_vgpr, max(41, amdgpu.max_num_vgpr) ; CHECK: .set I_Error.num_agpr, max(0, amdgpu.max_num_agpr) -; CHECK: .set I_Error.numbered_sgpr, max(48, amdgpu.max_num_sgpr) +; CHECK: .set I_Error.numbered_sgpr, max(56, amdgpu.max_num_sgpr) ; CHECK: .set I_Error.private_seg_size, 16 ; CHECK: .set I_Error.uses_vcc, 1 ; CHECK: .set I_Error.uses_flat_scratch, 1 @@ -144,7 +144,7 @@ define void @I_Error(...) 
{ ; CHECK-LABEL: G_DoReborn: ; CHECK: .set G_DoReborn.num_vgpr, max(44, P_RemoveMobj.num_vgpr, P_SpawnMobj.num_vgpr, P_SpawnPlayer.num_vgpr, I_Error.num_vgpr) ; CHECK: .set G_DoReborn.num_agpr, max(0, P_RemoveMobj.num_agpr, P_SpawnMobj.num_agpr, P_SpawnPlayer.num_agpr, I_Error.num_agpr) -; CHECK: .set G_DoReborn.numbered_sgpr, max(72, P_RemoveMobj.numbered_sgpr, P_SpawnMobj.numbered_sgpr, P_SpawnPlayer.numbered_sgpr, I_Error.numbered_sgpr) +; CHECK: .set G_DoReborn.numbered_sgpr, max(104, P_RemoveMobj.numbered_sgpr, P_SpawnMobj.numbered_sgpr, P_SpawnPlayer.numbered_sgpr, I_Error.numbered_sgpr) ; CHECK: .set G_DoReborn.private_seg_size, 32+(max(P_RemoveMobj.private_seg_size, P_SpawnMobj.private_seg_size, P_SpawnPlayer.private_seg_size, I_Error.private_seg_size)) ; CHECK: .set G_DoReborn.uses_vcc, or(1, P_RemoveMobj.uses_vcc, P_SpawnMobj.uses_vcc, P_SpawnPlayer.uses_vcc, I_Error.uses_vcc) ; CHECK: .set G_DoReborn.uses_flat_scratch, or(0, P_RemoveMobj.uses_flat_scratch, P_SpawnMobj.uses_flat_scratch, P_SpawnPlayer.uses_flat_scratch, I_Error.uses_flat_scratch) @@ -218,7 +218,7 @@ define void @F_Ticker() { ; CHECK-LABEL: G_CheckDemoStatus: ; CHECK: .set G_CheckDemoStatus.num_vgpr, max(43, I_Quit.num_vgpr, D_AdvanceDemo.num_vgpr, I_Error.num_vgpr) ; CHECK: .set G_CheckDemoStatus.num_agpr, max(0, I_Quit.num_agpr, D_AdvanceDemo.num_agpr, I_Error.num_agpr) -; CHECK: .set G_CheckDemoStatus.numbered_sgpr, max(60, I_Quit.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, I_Error.numbered_sgpr) +; CHECK: .set G_CheckDemoStatus.numbered_sgpr, max(84, I_Quit.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, I_Error.numbered_sgpr) ; CHECK: .set G_CheckDemoStatus.private_seg_size, 32+(max(I_Quit.private_seg_size, D_AdvanceDemo.private_seg_size, I_Error.private_seg_size)) ; CHECK: .set G_CheckDemoStatus.uses_vcc, or(1, I_Quit.uses_vcc, D_AdvanceDemo.uses_vcc, I_Error.uses_vcc) ; CHECK: .set G_CheckDemoStatus.uses_flat_scratch, or(0, I_Quit.uses_flat_scratch, D_AdvanceDemo.uses_flat_scratch, 
I_Error.uses_flat_scratch) @@ -264,7 +264,7 @@ define ptr @P_SaveGameFile() { ; CHECK-LABEL: R_FlatNumForName: ; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr) ; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr) -; CHECK: .set R_FlatNumForName.numbered_sgpr, max(48, I_Error.numbered_sgpr) +; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr) ; CHECK: .set R_FlatNumForName.private_seg_size, 16+(max(I_Error.private_seg_size)) ; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc) ; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch) @@ -279,7 +279,7 @@ define i32 @R_FlatNumForName() { ; CHECK-LABEL: R_TextureNumForName: ; CHECK: .set R_TextureNumForName.num_vgpr, max(42, R_FlatNumForName.num_vgpr) ; CHECK: .set R_TextureNumForName.num_agpr, max(0, R_FlatNumForName.num_agpr) -; CHECK: .set R_TextureNumForName.numbered_sgpr, max(48, R_FlatNumForName.numbered_sgpr) +; CHECK: .set R_TextureNumForName.numbered_sgpr, max(56, R_FlatNumForName.numbered_sgpr) ; CHECK: .set R_TextureNumForName.private_seg_size, 16+(max(R_FlatNumForName.private_seg_size)) ; CHECK: .set R_TextureNumForName.uses_vcc, or(1, R_FlatNumForName.uses_vcc) ; CHECK: .set R_TextureNumForName.uses_flat_scratch, or(0, R_FlatNumForName.uses_flat_scratch) @@ -292,10 +292,10 @@ define i32 @R_TextureNumForName() { } ; CHECK-LABEL: G_Ticker: -; CHECK: .set G_Ticker.num_vgpr, max(46, G_DoReborn.num_vgpr, F_Ticker.num_vgpr, AM_Stop.num_vgpr, F_StartFinale.num_vgpr, D_AdvanceDemo.num_vgpr, R_FlatNumForName.num_vgpr, R_TextureNumForName.num_vgpr, P_TempSaveGameFile.num_vgpr, P_SaveGameFile.num_vgpr, I_Error.num_vgpr) +; CHECK: .set G_Ticker.num_vgpr, max(47, G_DoReborn.num_vgpr, F_Ticker.num_vgpr, AM_Stop.num_vgpr, F_StartFinale.num_vgpr, D_AdvanceDemo.num_vgpr, R_FlatNumForName.num_vgpr, R_TextureNumForName.num_vgpr, P_TempSaveGameFile.num_vgpr, P_SaveGameFile.num_vgpr, I_Error.num_vgpr) ; CHECK: .set G_Ticker.num_agpr, 
max(0, G_DoReborn.num_agpr, F_Ticker.num_agpr, AM_Stop.num_agpr, F_StartFinale.num_agpr, D_AdvanceDemo.num_agpr, R_FlatNumForName.num_agpr, R_TextureNumForName.num_agpr, P_TempSaveGameFile.num_agpr, P_SaveGameFile.num_agpr, I_Error.num_agpr) -; CHECK: .set G_Ticker.numbered_sgpr, max(84, G_DoReborn.numbered_sgpr, F_Ticker.numbered_sgpr, AM_Stop.numbered_sgpr, F_StartFinale.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, R_FlatNumForName.numbered_sgpr, R_TextureNumForName.numbered_sgpr, P_TempSaveGameFile.numbered_sgpr, P_SaveGameFile.numbered_sgpr, I_Error.numbered_sgpr) -; CHECK: .set G_Ticker.private_seg_size, 32+(max(G_DoReborn.private_seg_size, F_Ticker.private_seg_size, AM_Stop.private_seg_size, F_StartFinale.private_seg_size, D_AdvanceDemo.private_seg_size, R_FlatNumForName.private_seg_size, R_TextureNumForName.private_seg_size, P_TempSaveGameFile.private_seg_size, P_SaveGameFile.private_seg_size, I_Error.private_seg_size)) +; CHECK: .set G_Ticker.numbered_sgpr, max(105, G_DoReborn.numbered_sgpr, F_Ticker.numbered_sgpr, AM_Stop.numbered_sgpr, F_StartFinale.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, R_FlatNumForName.numbered_sgpr, R_TextureNumForName.numbered_sgpr, P_TempSaveGameFile.numbered_sgpr, P_SaveGameFile.numbered_sgpr, I_Error.numbered_sgpr) +; CHECK: .set G_Ticker.private_seg_size, 48+(max(G_DoReborn.private_seg_size, F_Ticker.private_seg_size, AM_Stop.private_seg_size, F_StartFinale.private_seg_size, D_AdvanceDemo.private_seg_size, R_FlatNumForName.private_seg_size, R_TextureNumForName.private_seg_size, P_TempSaveGameFile.private_seg_size, P_SaveGameFile.private_seg_size, I_Error.private_seg_size)) ; CHECK: .set G_Ticker.uses_vcc, or(1, G_DoReborn.uses_vcc, F_Ticker.uses_vcc, AM_Stop.uses_vcc, F_StartFinale.uses_vcc, D_AdvanceDemo.uses_vcc, R_FlatNumForName.uses_vcc, R_TextureNumForName.uses_vcc, P_TempSaveGameFile.uses_vcc, P_SaveGameFile.uses_vcc, I_Error.uses_vcc) ; CHECK: .set G_Ticker.uses_flat_scratch, or(0, G_DoReborn.uses_flat_scratch, 
F_Ticker.uses_flat_scratch, AM_Stop.uses_flat_scratch, F_StartFinale.uses_flat_scratch, D_AdvanceDemo.uses_flat_scratch, R_FlatNumForName.uses_flat_scratch, R_TextureNumForName.uses_flat_scratch, P_TempSaveGameFile.uses_flat_scratch, P_SaveGameFile.uses_flat_scratch, I_Error.uses_flat_scratch) ; CHECK: .set G_Ticker.has_dyn_sized_stack, or(0, G_DoReborn.has_dyn_sized_stack, F_Ticker.has_dyn_sized_stack, AM_Stop.has_dyn_sized_stack, F_StartFinale.has_dyn_sized_stack, D_AdvanceDemo.has_dyn_sized_stack, R_FlatNumForName.has_dyn_sized_stack, R_TextureNumForName.has_dyn_sized_stack, P_TempSaveGameFile.has_dyn_sized_stack, P_SaveGameFile.has_dyn_sized_stack, I_Error.has_dyn_sized_stack) @@ -316,9 +316,9 @@ define void @G_Ticker() { } ; CHECK-LABEL: RunTic: -; CHECK: .set RunTic.num_vgpr, max(46, G_CheckDemoStatus.num_vgpr, D_AdvanceDemo.num_vgpr, G_Ticker.num_vgpr) +; CHECK: .set RunTic.num_vgpr, max(47, G_CheckDemoStatus.num_vgpr, D_AdvanceDemo.num_vgpr, G_Ticker.num_vgpr) ; CHECK: .set RunTic.num_agpr, max(0, G_CheckDemoStatus.num_agpr, D_AdvanceDemo.num_agpr, G_Ticker.num_agpr) -; CHECK: .set RunTic.numbered_sgpr, max(84, G_CheckDemoStatus.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, G_Ticker.numbered_sgpr) +; CHECK: .set RunTic.numbered_sgpr, max(105, G_CheckDemoStatus.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, G_Ticker.numbered_sgpr) ; CHECK: .set RunTic.private_seg_size, 32+(max(G_CheckDemoStatus.private_seg_size, D_AdvanceDemo.private_seg_size, G_Ticker.private_seg_size)) ; CHECK: .set RunTic.uses_vcc, or(1, G_CheckDemoStatus.uses_vcc, D_AdvanceDemo.uses_vcc, G_Ticker.uses_vcc) ; CHECK: .set RunTic.uses_flat_scratch, or(0, G_CheckDemoStatus.uses_flat_scratch, D_AdvanceDemo.uses_flat_scratch, G_Ticker.uses_flat_scratch) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index ab485b1799470..a00af8e5b6582 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -26,29 +26,26 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-NEXT: s_mov_b32 s2, 2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-NEXT: s_mov_b32 s2, 0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: ; implicit-def: $sgpr2 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s3, s4 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-NEXT: s_mov_b32 s2, s5 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_add_co_u32 v0, s3, s3, v0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 ; GFX12-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-NEXT: v_mov_b32_e32 v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index ddc4673a290fe..df5b45dea0c2f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -463,29 +463,26 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-LABEL: flat_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: 
s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, s4 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-WGP-NEXT: s_mov_b32 s2, s5 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 ; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 @@ -499,29 +496,26 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-LABEL: flat_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr2 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, s4 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1 ; 
GFX12-CU-NEXT: s_mov_b32 s2, s5 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 ; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 @@ -991,18 +985,17 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-WGP-NEXT: s_mov_b32 s0, 0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-WGP-NEXT: s_mov_b32 s1, s2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-WGP-NEXT: s_mov_b32 s0, s3 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 ; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3 @@ -1025,18 +1018,17 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-CU-NEXT: s_mov_b32 s0, 0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-CU-NEXT: s_mov_b32 s1, s2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-CU-NEXT: s_mov_b32 s0, s3 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-CU-NEXT: 
s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 ; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index f10715033e433..e1f82a70b4c0a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -325,29 +325,26 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-LABEL: flat_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, s4 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-WGP-NEXT: s_mov_b32 s2, s5 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 ; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2 @@ -364,29 +361,26 @@ define 
amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-LABEL: flat_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr2 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, s4 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1 ; GFX12-CU-NEXT: s_mov_b32 s2, s5 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3 ; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2 @@ -733,18 +727,17 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-WGP-NEXT: s_mov_b32 s0, 0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: ; implicit-def: $sgpr0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-WGP-NEXT: s_mov_b32 s1, s2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-WGP-NEXT: s_mov_b32 s0, s3 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-WGP-NEXT: 
s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff ; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 ; GFX12-WGP-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v3 @@ -772,18 +765,17 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 ; GFX12-CU-NEXT: s_mov_b32 s0, 0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: ; implicit-def: $sgpr0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-CU-NEXT: s_mov_b32 s1, s2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-CU-NEXT: s_mov_b32 s0, s3 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff ; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1 ; GFX12-CU-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index 465626078f6c6..5f952b98041f3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -28,7 +28,6 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-NEXT: s_mov_b32 s4, 2 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -73,7 +72,6 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 
0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-NEXT: s_mov_b32 s4, 2 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index f06118a7a6dc9..ebcc900307c46 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -416,7 +416,6 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-WGP-NEXT: s_mov_b32 s4, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -434,7 +433,6 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-CU-NEXT: s_mov_b32 s4, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -832,7 +830,6 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -849,7 +846,6 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 2bf2e03cb0bd7..7dfd5e60c24f8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -309,7 +309,6 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-WGP-NEXT: s_mov_b32 s4, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -329,7 +328,6 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX12-CU-NEXT: s_mov_b32 s4, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -647,7 +645,6 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -670,7 +667,6 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 3c485af18166f..e9be38d6d17a3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -396,11 +396,10 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; 
GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -414,11 +413,10 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -774,7 +772,6 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-WGP-NEXT: s_mov_b32 s1, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -791,7 +788,6 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-CU-NEXT: s_mov_b32 s1, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 9740e0ae1d167..9e5f5fcffca9f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -276,11 
+276,10 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 @@ -294,11 +293,10 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -552,7 +550,6 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-WGP-NEXT: s_mov_b32 s1, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -574,7 +571,6 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 ; GFX12-CU-NEXT: s_mov_b32 s1, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll index f988a4d33add9..bc905fa564f8a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll @@ -27,7 +27,6 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-NEXT: s_mov_b32 s3, 2 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 71f28efd47811..6feab49ed86b6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -418,7 +418,6 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -436,7 +435,6 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -818,7 +816,6 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -835,7 +832,6 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 
0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 3346a034f963f..f8fb7986938f2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -304,7 +304,6 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_mov_b32 s3, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -324,7 +323,6 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_mov_b32 s3, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -624,7 +622,6 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -647,7 +644,6 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll index 7059e80d5c3d1..db1399cc74dc6 100644 --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s ; Effectively, check that the compile finishes; in the case diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir index 73b114b6d0969..9c02c935b9498 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir @@ -50,7 +50,7 @@ body: | $m0 = S_MOV_B32 -1 %2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load (s32)) - %20:sgpr_32 = V_READFIRSTLANE_B32 %2, implicit $exec + %20:sreg_32_xm0 = V_READFIRSTLANE_B32 %2, implicit $exec %21:sgpr_32 = S_ADD_U32 %20, 4, implicit-def $scc ; The S_ADDC_U32 depends on the first DS_READ_B32 only via SCC diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index f5fb85d63b8e4..a32b3b71cd606 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -280,17 +280,20 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -298,19 +301,22 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 
v0, v3, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y) @@ -395,15 +401,19 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 ; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll index 856cf61127849..91964ab0e91a6 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll @@ -31,22 +31,22 @@ define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) { ; GFX11-NEXT: bb.1: ; GFX11-NEXT: successors: %bb.2(0x80000000) ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE]].sub0, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX11-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX11-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX11-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub4, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub5, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub4, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub5, implicit $exec ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1 ; GFX11-NEXT: 
[[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub4_sub5, implicit $exec ; GFX11-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U64_e64_2]], implicit-def $scc - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub6, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub7, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub6, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub7, implicit $exec ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1 ; GFX11-NEXT: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE]].sub6_sub7, implicit $exec ; GFX11-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_1]], [[V_CMP_EQ_U64_e64_3]], implicit-def $scc @@ -93,22 +93,22 @@ define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) { ; GFX12-NEXT: bb.1: ; GFX12-NEXT: successors: %bb.2(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX12-NEXT: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub4, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub5, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub4, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub5, implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1 ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub4_sub5, implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U64_e64_2]], implicit-def $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub6, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub7, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub6, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub7, implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1 ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE]].sub6_sub7, implicit $exec ; GFX12-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_1]], [[V_CMP_EQ_U64_e64_3]], implicit-def $scc @@ -165,22 +165,22 @@ define amdgpu_ps float @vsample_move_to_valu_rsrc(<8 x i32> %rsrc, <4 x i32> inr ; GFX11-NEXT: bb.1: ; GFX11-NEXT: successors: %bb.2(0x80000000) ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX11-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE]].sub3, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX11-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX11-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub4, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub5, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub4, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub5, implicit $exec ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1 ; GFX11-NEXT: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE]].sub4_sub5, implicit $exec ; GFX11-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U64_e64_2]], implicit-def $scc - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub6, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub7, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub6, implicit $exec + ; GFX11-NEXT: 
[[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub7, implicit $exec ; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1 ; GFX11-NEXT: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE]].sub6_sub7, implicit $exec ; GFX11-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_1]], [[V_CMP_EQ_U64_e64_3]], implicit-def $scc @@ -232,22 +232,22 @@ define amdgpu_ps float @vsample_move_to_valu_rsrc(<8 x i32> %rsrc, <4 x i32> inr ; GFX12-NEXT: bb.1: ; GFX12-NEXT: successors: %bb.2(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; GFX12-NEXT: 
[[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub4, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub5, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub4, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub5, implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1 ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE]].sub4_sub5, implicit $exec ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U64_e64_2]], implicit-def $scc - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub6, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub7, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub6, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub7, implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1 ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE]].sub6_sub7, implicit $exec ; GFX12-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_1]], [[V_CMP_EQ_U64_e64_3]], implicit-def $scc @@ -300,12 +300,12 @@ define amdgpu_ps float @vsample_move_to_valu_samp(<8 x i32> inreg %rsrc, <4 x i3 ; GFX11-NEXT: bb.1: ; GFX11-NEXT: successors: %bb.2(0x80000000) ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub0, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub1, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub0, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub1, implicit $exec ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX11-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE1]].sub0_sub1, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub2, implicit $exec - ; GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub3, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub2, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub3, implicit $exec ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX11-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE1]].sub2_sub3, implicit $exec ; GFX11-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = 
S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -353,12 +353,12 @@ define amdgpu_ps float @vsample_move_to_valu_samp(<8 x i32> inreg %rsrc, <4 x i3 ; GFX12-NEXT: bb.1: ; GFX12-NEXT: successors: %bb.2(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub0, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub1, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub0, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub1, implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE1]].sub0_sub1, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub2, implicit $exec - ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub3, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub2, implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub3, implicit $exec ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE1]].sub2_sub3, implicit $exec ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc diff --git 
a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir index c702de6285d9b..f9dd736dd1454 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -46,12 +46,12 @@ body: | ; W64-NEXT: .1: ; W64-NEXT: successors: %bb.2(0x80000000) ; W64-NEXT: {{ $}} - ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; W64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; W64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; W64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; W64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; W64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def 
$scc @@ -91,12 +91,12 @@ body: | ; W32-NEXT: .1: ; W32-NEXT: successors: %bb.2(0x80000000) ; W32-NEXT: {{ $}} - ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -161,12 +161,12 @@ body: | ; W64-NEXT: .1: ; W64-NEXT: successors: %bb.2(0x80000000) ; W64-NEXT: {{ $}} - ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W64-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; W64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; W64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; W64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; W64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; W64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -206,12 +206,12 @@ body: | ; W32-NEXT: .1: ; W32-NEXT: successors: %bb.2(0x80000000) ; W32-NEXT: {{ $}} - ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W32-NEXT: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -276,12 +276,12 @@ body: | ; W64-NEXT: .1: ; W64-NEXT: successors: %bb.2(0x80000000) ; W64-NEXT: {{ $}} - ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; W64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; 
W64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; W64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; W64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; W64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -321,12 +321,12 @@ body: | ; W32-NEXT: .1: ; W32-NEXT: successors: %bb.2(0x80000000) ; W32-NEXT: {{ $}} - ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; 
W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -496,12 +496,12 @@ body: | ; W64-NO-ADDR64-NEXT: .1: ; W64-NO-ADDR64-NEXT: successors: %bb.2(0x80000000) ; W64-NO-ADDR64-NEXT: {{ $}} - ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; W64-NO-ADDR64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W64-NO-ADDR64-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; W64-NO-ADDR64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; W64-NO-ADDR64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc @@ -541,12 +541,12 @@ body: | ; W32-NEXT: .1: ; W32-NEXT: successors: %bb.2(0x80000000) ; W32-NEXT: {{ $}} - ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit 
$exec ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index 10d08032bf59a..a16bbeddde7f9 100644 --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -326,13 +326,12 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value( ; IR: Flow2: -; IR: %8 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] -; IR: %9 = phi i1 [ false, %exit1 ], [ %13, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf.i64(i64 %17) +; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16) ; IR: UnifiedReturnBlock: -; IR: %UnifiedRetVal = phi float [ %8, %Flow2 ], [ 1.000000e+00, %exit0 ] -; IR: call void @llvm.amdgcn.end.cf.i64(i64 %12) +; IR: %UnifiedRetVal = phi float [ 2.000000e+00, %Flow2 ], [ 1.000000e+00, %exit0 ] +; IR: call void @llvm.amdgcn.end.cf.i64(i64 %11) ; IR: ret float %UnifiedRetVal define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 { entry: @@ -367,7 +366,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; GCN: {{^}}[[FLOW]]: ; GCN: s_or_b64 exec, exec -; GCN: v_mov_b32_e32 v0, s6 +; GCN: v_mov_b32_e32 v0, 2.0 ; GCN-NOT: s_and_b64 exec, exec ; GCN: v_mov_b32_e32 v0, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 6c62f3f225cd9..e1ba8c9aa5a0d 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ 
b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -1,46 +1,47 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -mtriple=amdgcn-- -lowerswitch -structurizecfg -si-annotate-control-flow < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; Ensure two if.break calls, for both the inner and outer loops ; FIXME: duplicate comparison define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) { -; OPT-LABEL: @multi_else_break( -; OPT-NEXT: main_body: -; OPT-NEXT: br label [[LOOP_OUTER:%.*]] -; OPT: LOOP.outer: -; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP8:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ] -; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP3:%.*]], [[FLOW1]] ] -; OPT-NEXT: br label [[LOOP:%.*]] -; OPT: LOOP: -; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP6:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ] -; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP3]], [[FLOW]] ] -; OPT-NEXT: [[TMP48:%.*]] = icmp slt i32 [[TMP45]], [[UB:%.*]] +; OPT-LABEL: define amdgpu_vs void @multi_else_break( +; OPT-SAME: <4 x float> [[VEC:%.*]], i32 [[UB:%.*]], i32 [[CONT:%.*]]) { +; OPT-NEXT: [[MAIN_BODY:.*]]: +; OPT-NEXT: br label %[[LOOP_OUTER:.*]] +; OPT: [[LOOP_OUTER]]: +; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP8:%.*]], %[[FLOW1:.*]] ], [ 0, %[[MAIN_BODY]] ] +; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, %[[MAIN_BODY]] ], [ [[TMP3:%.*]], %[[FLOW1]] ] +; OPT-NEXT: br label %[[LOOP:.*]] +; OPT: [[LOOP]]: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP6:%.*]], %[[FLOW:.*]] ], [ 0, %[[LOOP_OUTER]] ] +; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], %[[LOOP_OUTER]] ], [ [[TMP3]], %[[FLOW]] ] +; OPT-NEXT: [[TMP48:%.*]] = icmp slt i32 [[TMP45]], [[UB]] ; OPT-NEXT: [[TMP0:%.*]] = call { i1, i64 } 
@llvm.amdgcn.if.i64(i1 [[TMP48]]) ; OPT-NEXT: [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0 ; OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1 -; OPT-NEXT: br i1 [[TMP1]], label [[ENDIF:%.*]], label [[FLOW]] -; OPT: Flow: -; OPT-NEXT: [[TMP3]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ undef, [[LOOP]] ] -; OPT-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] -; OPT-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] +; OPT-NEXT: br i1 [[TMP1]], label %[[ENDIF:.*]], label %[[FLOW]] +; OPT: [[FLOW]]: +; OPT-NEXT: [[TMP3]] = phi i32 [ [[TMP47:%.*]], %[[ENDIF]] ], [ poison, %[[LOOP]] ] +; OPT-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP51:%.*]], %[[ENDIF]] ], [ true, %[[LOOP]] ] +; OPT-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP51_INV:%.*]], %[[ENDIF]] ], [ true, %[[LOOP]] ] ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) ; OPT-NEXT: [[TMP6]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP7:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP6]]) -; OPT-NEXT: br i1 [[TMP7]], label [[FLOW1]], label [[LOOP]] -; OPT: Flow1: +; OPT-NEXT: br i1 [[TMP7]], label %[[FLOW1]], label %[[LOOP]] +; OPT: [[FLOW1]]: ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) ; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN2]]) ; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]]) -; OPT-NEXT: br i1 [[TMP9]], label [[IF:%.*]], label [[LOOP_OUTER]] -; OPT: IF: +; OPT-NEXT: br i1 [[TMP9]], label %[[IF:.*]], label %[[LOOP_OUTER]] +; OPT: [[IF]]: ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) ; OPT-NEXT: ret void -; OPT: ENDIF: +; OPT: [[ENDIF]]: ; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1 -; OPT-NEXT: [[TMP51]] = icmp eq i32 [[TMP47]], [[CONT:%.*]] +; OPT-NEXT: [[TMP51]] = icmp eq i32 [[TMP47]], [[CONT]] ; OPT-NEXT: [[TMP51_INV]] = xor i1 [[TMP51]], true -; OPT-NEXT: br label [[FLOW]] +; OPT-NEXT: br label %[[FLOW]] 
; ; GCN-LABEL: multi_else_break: ; GCN: ; %bb.0: ; %main_body @@ -113,55 +114,52 @@ ENDIF: ; preds = %LOOP } define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { -; OPT-LABEL: @multi_if_break_loop( -; OPT-NEXT: bb: +; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop( +; OPT-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[BB:.*]]: ; OPT-NEXT: [[ID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; OPT-NEXT: [[TMP:%.*]] = sub i32 [[ID]], [[ARG:%.*]] -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: bb1: -; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP4:%.*]], [[FLOW4:%.*]] ], [ 0, [[BB:%.*]] ] -; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[TMP2:%.*]], [[FLOW4]] ] -; OPT-NEXT: [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1 -; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0 +; OPT-NEXT: [[TMP:%.*]] = sub i32 [[ID]], [[ARG]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP4:%.*]], %[[FLOW4:.*]] ], [ 0, %[[BB]] ] +; OPT-NEXT: [[LSR_IV:%.*]] = phi i32 [ poison, %[[BB]] ], [ [[TMP2:%.*]], %[[FLOW4]] ] +; OPT-NEXT: [[TMP2]] = add i32 [[LSR_IV]], 1 +; OPT-NEXT: [[CMP0:%.*]] = icmp slt i32 [[TMP2]], 0 ; OPT-NEXT: [[LOAD0:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4 -; OPT-NEXT: br label [[NODEBLOCK:%.*]] -; OPT: NodeBlock: +; OPT-NEXT: br label %[[NODEBLOCK:.*]] +; OPT: [[NODEBLOCK]]: ; OPT-NEXT: [[PIVOT:%.*]] = icmp sge i32 [[LOAD0]], 1 -; OPT-NEXT: br i1 [[PIVOT]], label [[LEAFBLOCK1:%.*]], label [[FLOW:%.*]] -; OPT: LeafBlock1: +; OPT-NEXT: br i1 [[PIVOT]], label %[[LEAFBLOCK1:.*]], label %[[FLOW:.*]] +; OPT: [[LEAFBLOCK1]]: ; OPT-NEXT: [[SWITCHLEAF2:%.*]] = icmp eq i32 [[LOAD0]], 1 -; OPT-NEXT: br i1 [[SWITCHLEAF2]], label [[CASE1:%.*]], label [[FLOW3:%.*]] -; OPT: Flow3: -; OPT-NEXT: [[TMP0:%.*]] = phi i32 [ [[LSR_IV_NEXT]], [[CASE1]] ], [ undef, [[LEAFBLOCK1]] ] -; OPT-NEXT: [[TMP1:%.*]] = phi i1 [ [[CMP2:%.*]], [[CASE1]] ], [ true, [[LEAFBLOCK1]] ] -; OPT-NEXT: br 
label [[FLOW]] -; OPT: LeafBlock: +; OPT-NEXT: br i1 [[SWITCHLEAF2]], label %[[CASE1:.*]], label %[[FLOW3:.*]] +; OPT: [[FLOW3]]: +; OPT-NEXT: [[TMP1:%.*]] = phi i1 [ [[CMP2:%.*]], %[[CASE1]] ], [ true, %[[LEAFBLOCK1]] ] +; OPT-NEXT: br label %[[FLOW]] +; OPT: [[LEAFBLOCK:.*]]: ; OPT-NEXT: [[SWITCHLEAF:%.*]] = icmp eq i32 [[LOAD0]], 0 -; OPT-NEXT: br i1 [[SWITCHLEAF]], label [[CASE0:%.*]], label [[FLOW5:%.*]] -; OPT: Flow4: -; OPT-NEXT: [[TMP2]] = phi i32 [ [[TMP9:%.*]], [[FLOW5]] ], [ [[TMP6:%.*]], [[FLOW]] ] -; OPT-NEXT: [[TMP3:%.*]] = phi i1 [ [[TMP10:%.*]], [[FLOW5]] ], [ [[TMP7:%.*]], [[FLOW]] ] +; OPT-NEXT: br i1 [[SWITCHLEAF]], label %[[CASE0:.*]], label %[[FLOW5:.*]] +; OPT: [[FLOW4]]: +; OPT-NEXT: [[TMP3:%.*]] = phi i1 [ [[TMP10:%.*]], %[[FLOW5]] ], [ [[TMP7:%.*]], %[[FLOW]] ] ; OPT-NEXT: [[TMP4]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP3]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP5:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP4]]) -; OPT-NEXT: br i1 [[TMP5]], label [[BB9:%.*]], label [[BB1]] -; OPT: case0: +; OPT-NEXT: br i1 [[TMP5]], label %[[BB9:.*]], label %[[BB1]] +; OPT: [[CASE0]]: ; OPT-NEXT: [[LOAD1:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4 ; OPT-NEXT: [[CMP1:%.*]] = icmp sge i32 [[TMP]], [[LOAD1]] -; OPT-NEXT: br label [[FLOW5]] -; OPT: Flow: -; OPT-NEXT: [[TMP6]] = phi i32 [ [[TMP0]], [[FLOW3]] ], [ undef, [[NODEBLOCK]] ] -; OPT-NEXT: [[TMP7]] = phi i1 [ [[TMP1]], [[FLOW3]] ], [ true, [[NODEBLOCK]] ] -; OPT-NEXT: [[TMP8:%.*]] = phi i1 [ false, [[FLOW3]] ], [ true, [[NODEBLOCK]] ] -; OPT-NEXT: br i1 [[TMP8]], label [[LEAFBLOCK:%.*]], label [[FLOW4]] -; OPT: case1: +; OPT-NEXT: br label %[[FLOW5]] +; OPT: [[FLOW]]: +; OPT-NEXT: [[TMP7]] = phi i1 [ [[TMP1]], %[[FLOW3]] ], [ true, %[[NODEBLOCK]] ] +; OPT-NEXT: [[TMP8:%.*]] = phi i1 [ false, %[[FLOW3]] ], [ true, %[[NODEBLOCK]] ] +; OPT-NEXT: br i1 [[TMP8]], label %[[LEAFBLOCK]], label %[[FLOW4]] +; OPT: [[CASE1]]: ; OPT-NEXT: [[LOAD2:%.*]] = load volatile i32, ptr addrspace(1) 
undef, align 4 ; OPT-NEXT: [[CMP2]] = icmp sge i32 [[TMP]], [[LOAD2]] -; OPT-NEXT: br label [[FLOW3]] -; OPT: Flow5: -; OPT-NEXT: [[TMP9]] = phi i32 [ [[LSR_IV_NEXT]], [[CASE0]] ], [ undef, [[LEAFBLOCK]] ] -; OPT-NEXT: [[TMP10]] = phi i1 [ [[CMP1]], [[CASE0]] ], [ [[TMP7]], [[LEAFBLOCK]] ] -; OPT-NEXT: br label [[FLOW4]] -; OPT: bb9: +; OPT-NEXT: br label %[[FLOW3]] +; OPT: [[FLOW5]]: +; OPT-NEXT: [[TMP10]] = phi i1 [ [[CMP1]], %[[CASE0]] ], [ [[TMP7]], %[[LEAFBLOCK]] ] +; OPT-NEXT: br label %[[FLOW4]] +; OPT: [[BB9]]: ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP4]]) ; OPT-NEXT: ret void ; @@ -227,7 +225,7 @@ bb: br label %bb1 bb1: - %lsr.iv = phi i32 [ undef, %bb ], [ %lsr.iv.next, %case0 ], [ %lsr.iv.next, %case1 ] + %lsr.iv = phi i32 [ poison, %bb ], [ %lsr.iv.next, %case0 ], [ %lsr.iv.next, %case1 ] %lsr.iv.next = add i32 %lsr.iv, 1 %cmp0 = icmp slt i32 %lsr.iv.next, 0 %load0 = load volatile i32, ptr addrspace(1) undef, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll index bd6ef9e088b12..fd0cafcbe4708 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -236,8 +236,8 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar ; IR-NEXT: [[MY_TMP17:%.*]] = bitcast i64 [[MY_TMP3]] to <2 x i32> ; IR-NEXT: br label %[[BB18:.*]] ; IR: [[FLOW1]]: -; IR-NEXT: [[TMP11]] = phi <4 x i32> [ [[MY_TMP9:%.*]], %[[BB21:.*]] ], [ undef, %[[BB14]] ] -; IR-NEXT: [[TMP12]] = phi i32 [ [[MY_TMP10:%.*]], %[[BB21]] ], [ undef, %[[BB14]] ] +; IR-NEXT: [[TMP11]] = phi <4 x i32> [ [[MY_TMP9:%.*]], %[[BB21:.*]] ], [ poison, %[[BB14]] ] +; IR-NEXT: [[TMP12]] = phi i32 [ [[MY_TMP10:%.*]], %[[BB21]] ], [ poison, %[[BB14]] ] ; IR-NEXT: [[TMP13:%.*]] = phi i1 [ [[MY_TMP12:%.*]], %[[BB21]] ], [ true, %[[BB14]] ] ; IR-NEXT: [[TMP14]] = phi i1 [ [[MY_TMP12]], %[[BB21]] ], [ false, %[[BB14]] ] ; IR-NEXT: 
[[TMP15:%.*]] = phi i1 [ false, %[[BB21]] ], [ true, %[[BB14]] ] diff --git a/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir b/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir index b501fd037574b..bb9d22fedf38d 100644 --- a/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir +++ b/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir @@ -56,7 +56,7 @@ body: | ; GFX9-NEXT: bb.1: ; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_AND_B32_e32_]], implicit $exec + ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_AND_B32_e32_]], implicit $exec ; GFX9-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[V_AND_B32_e32_]], implicit $exec ; GFX9-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: S_SET_GPR_IDX_ON [[V_READFIRSTLANE_B32_]], 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode @@ -99,7 +99,7 @@ body: | bb.1: successors: %bb.1, %bb.2 - %21:sgpr_32 = V_READFIRSTLANE_B32 %19, implicit $exec + %21:sreg_32_xm0 = V_READFIRSTLANE_B32 %19, implicit $exec %22:sreg_64 = V_CMP_EQ_U32_e64 %21, %19, implicit $exec %23:sreg_64 = S_AND_SAVEEXEC_B64 killed %22, implicit-def $exec, implicit-def dead $scc, implicit $exec S_SET_GPR_IDX_ON killed %21, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll index 080a9bd262ccb..ce6c3ad5af1e8 100644 --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -188,6 +188,38 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { ; CHECK-NEXT: .cfi_undefined 59 ; CHECK-NEXT: .cfi_undefined 60 ; 
CHECK-NEXT: .cfi_undefined 61 +; CHECK-NEXT: .cfi_undefined 72 +; CHECK-NEXT: .cfi_undefined 73 +; CHECK-NEXT: .cfi_undefined 74 +; CHECK-NEXT: .cfi_undefined 75 +; CHECK-NEXT: .cfi_undefined 76 +; CHECK-NEXT: .cfi_undefined 77 +; CHECK-NEXT: .cfi_undefined 78 +; CHECK-NEXT: .cfi_undefined 79 +; CHECK-NEXT: .cfi_undefined 88 +; CHECK-NEXT: .cfi_undefined 89 +; CHECK-NEXT: .cfi_undefined 90 +; CHECK-NEXT: .cfi_undefined 91 +; CHECK-NEXT: .cfi_undefined 92 +; CHECK-NEXT: .cfi_undefined 93 +; CHECK-NEXT: .cfi_undefined 94 +; CHECK-NEXT: .cfi_undefined 95 +; CHECK-NEXT: .cfi_undefined 1096 +; CHECK-NEXT: .cfi_undefined 1097 +; CHECK-NEXT: .cfi_undefined 1098 +; CHECK-NEXT: .cfi_undefined 1099 +; CHECK-NEXT: .cfi_undefined 1100 +; CHECK-NEXT: .cfi_undefined 1101 +; CHECK-NEXT: .cfi_undefined 1102 +; CHECK-NEXT: .cfi_undefined 1103 +; CHECK-NEXT: .cfi_undefined 1112 +; CHECK-NEXT: .cfi_undefined 1113 +; CHECK-NEXT: .cfi_undefined 1114 +; CHECK-NEXT: .cfi_undefined 1115 +; CHECK-NEXT: .cfi_undefined 1116 +; CHECK-NEXT: .cfi_undefined 1117 +; CHECK-NEXT: .cfi_undefined 1118 +; CHECK-NEXT: .cfi_undefined 1119 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index b8be5b300bb7b..2c2058473e235 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -620,9 +620,11 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388606 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max: @@ -651,9 +653,11 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 16777214 %load = load i8, ptr %gep, align 4 @@ -825,9 +829,11 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max: @@ -838,9 +844,11 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -16777215 %load = load i8, ptr %gep, align 4 @@ -884,9 +892,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; 
GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0: @@ -915,9 +925,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589936639 %load = load i8, ptr %gep, align 4 @@ -961,9 +973,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1: @@ -992,9 +1006,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589936640 %load = load i8, ptr %gep, align 4 @@ 
-1038,9 +1054,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0: @@ -1069,9 +1087,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589938687 %load = load i8, ptr %gep, align 4 @@ -1115,9 +1135,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1: @@ -1128,9 +1150,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; 
GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589938688 %load = load i8, ptr %gep, align 4 @@ -1174,9 +1198,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0: @@ -1205,9 +1231,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589942783 %load = load i8, ptr %gep, align 4 @@ -1251,9 +1279,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1: @@ -1264,9 +1294,11 @@ define i8 
@flat_inst_valu_offset_64bit_13bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 8589942784 %load = load i8, ptr %gep, align 4 @@ -1311,9 +1343,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0: @@ -1334,9 +1368,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854773761 %load = load i8, ptr %gep, align 4 @@ -1381,9 +1417,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; 
GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1: @@ -1404,9 +1442,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854773760 %load = load i8, ptr %gep, align 4 @@ -1451,9 +1491,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0: @@ -1474,9 +1516,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; 
GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854771713 %load = load i8, ptr %gep, align 4 @@ -1521,9 +1565,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1: @@ -1544,9 +1590,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854771712 %load = load i8, ptr %gep, align 4 @@ -1591,9 +1639,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0: @@ -1614,9 +1664,11 @@ define i8 
@flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854767617 %load = load i8, ptr %gep, align 4 @@ -1661,9 +1713,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1: @@ -1684,9 +1738,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr %p, i64 -9223372036854767616 %load = load i8, ptr %gep, align 4 @@ -2657,7 +2713,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2752,7 +2808,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2847,7 +2903,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2943,7 +2999,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3039,7 +3095,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3135,7 +3191,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index fd62ba3f9da1f..d16d731c34384 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -659,9 +659,11 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max: @@ -699,9 +701,11 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8388606 ; GFX12-SDAG-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 16777214 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -869,9 +873,11 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max: @@ -909,9 +915,11 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8388607 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16777215 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -956,9 +964,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0: @@ -996,9 +1006,11 @@ define i8 
@global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1042,9 +1054,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1: @@ -1073,9 +1087,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1119,9 +1135,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; 
GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0: @@ -1159,9 +1177,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1205,9 +1225,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1: @@ -1218,9 +1240,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; 
GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1264,9 +1288,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0: @@ -1304,9 +1330,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1350,9 +1378,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1: @@ -1363,9 +1393,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) 
%p) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1410,9 +1442,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0: @@ -1451,9 +1485,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386561 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1498,9 +1534,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 +; 
GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1: @@ -1530,9 +1568,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386560 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1577,9 +1617,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0: @@ -1618,9 +1660,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off 
offset:-8384513 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1665,9 +1709,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1: @@ -1688,9 +1734,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384512 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1735,9 +1783,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 
s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0: @@ -1776,9 +1826,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380417 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -1823,9 +1875,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1: @@ -1846,9 +1900,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380416 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616 %load = load i8, ptr addrspace(1) %gep, align 4 @@ -2655,7 +2711,7 @@ define 
amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -2748,7 +2804,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -2841,7 +2897,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -2934,7 +2990,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -3027,7 +3083,7 @@ 
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -3120,7 +3176,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll index 3e45a2d0df43d..720eaeff2e1ec 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -40,15 +40,16 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: s_lshl_b32 s12, s12, 5 ; GCN-NEXT: s_cbranch_vccz .LBB0_6 ; GCN-NEXT: ; %bb.5: ; in Loop: Header=BB0_4 Depth=2 -; GCN-NEXT: s_mov_b64 s[14:15], s[2:3] +; GCN-NEXT: s_mov_b64 s[16:17], s[2:3] ; GCN-NEXT: s_branch .LBB0_7 ; GCN-NEXT: .LBB0_6: ; %bb3 ; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2 ; GCN-NEXT: s_add_i32 s12, s12, 1 -; GCN-NEXT: s_mov_b64 s[14:15], -1 +; GCN-NEXT: s_mov_b64 s[16:17], -1 ; GCN-NEXT: .LBB0_7: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2 -; GCN-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GCN-NEXT: s_mov_b64 s[14:15], -1 +; GCN-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GCN-NEXT: s_mov_b64 s[16:17], -1 ; GCN-NEXT: 
s_cbranch_vccnz .LBB0_3 ; GCN-NEXT: ; %bb.8: ; %bb4 diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll index 2e9f09ad41813..89bcfb3b3a834 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals ; Check that no attributes are added to graphics functions -; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-annotate-kernel-features %s | FileCheck -check-prefixes=AKF_GCN %s ; RUN: opt -S -mtriple=amdgcn-amd-amdpal -passes=amdgpu-attributor %s | FileCheck -check-prefixes=ATTRIBUTOR_GCN %s ; Check that it doesn't crash @@ -8,16 +7,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -global-isel < %s | FileCheck -check-prefixes=GFX10 %s -target datalayout = "A5" - - define amdgpu_cs void @test_simple_indirect_call() { -; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call() { -; AKF_GCN-NEXT: [[PC:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; AKF_GCN-NEXT: [[FUN:%.*]] = inttoptr i64 [[PC]] to ptr -; AKF_GCN-NEXT: call amdgpu_gfx void [[FUN]]() -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[PC:%.*]] = call i64 @llvm.amdgcn.s.getpc() @@ -68,8 +58,6 @@ declare i64 @llvm.amdgcn.s.getpc() #0 attributes #0 = { nounwind readnone speculatable willreturn } ;. -; AKF_GCN: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir index 8552d355b020e..71e7ca11a86cd 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -27,7 +27,7 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs - ; CHECK: liveins: $vgpr1, $vgpr2 + ; CHECK: liveins: $sgpr40, $sgpr41, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 @@ -58,42 +58,27 @@ body: | ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 - ; CHECK-NEXT: $sgpr4 = COPY $sgpr33 + ; CHECK-NEXT: $sgpr40 = frame-setup COPY $sgpr33 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION register $sgpr33, $sgpr40 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr2, 1048832 - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr33, $vgpr2, 0, 32 - ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr34, $vgpr2, 1, 32 + ; 
CHECK-NEXT: $sgpr41 = frame-setup COPY $sgpr34 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION register $sgpr34, $sgpr41 ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $sgpr33 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc - ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, 
implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr42 = S_MOV_B32 8192 + ; CHECK-NEXT: $vgpr0, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr0, 0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr42 = S_MOV_B32 16384 + ; CHECK-NEXT: $vgpr2, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr2, 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc - ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr41 ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_register $sgpr32 - ; CHECK-NEXT: $sgpr33 = COPY $sgpr4 + ; CHECK-NEXT: $sgpr33 = 
frame-destroy COPY $sgpr40 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -123,7 +108,7 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr - ; CHECK: liveins: $sgpr29, $vgpr1, $vgpr2 + ; CHECK: liveins: $sgpr29, $sgpr40, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 @@ -157,34 +142,21 @@ body: | ; CHECK-NEXT: frame-setup CFI_INSTRUCTION register $sgpr33, $sgpr29 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr2, 1048832 - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 0, undef $vgpr2 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION 
llvm_vector_registers $sgpr34, $vgpr2, 0, 32 + ; CHECK-NEXT: $sgpr40 = frame-setup COPY $sgpr34 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION register $sgpr34, $sgpr40 ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $sgpr33 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc - ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit 
$sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31 + ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr42 = S_MOV_B32 8192 + ; CHECK-NEXT: $vgpr0, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr0, 0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr42 = S_MOV_B32 16384 + ; CHECK-NEXT: $vgpr2, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr2, 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31 ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc - ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr40 ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_register $sgpr32 ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr29 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc @@ -255,16 +227,12 @@ body: | ; CHECK-NEXT: frame-setup CFI_INSTRUCTION 
def_cfa_register $sgpr33 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc - ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr40 = S_MOV_B32 8192 + ; CHECK-NEXT: $vgpr0, dead $sgpr40_sgpr41 = V_ADD_CO_U32_e64 killed $sgpr40, killed $vgpr0, 0, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr40 = S_MOV_B32 16384 + ; CHECK-NEXT: $vgpr2, dead $sgpr40_sgpr41 = V_ADD_CO_U32_e64 killed $sgpr40, killed $vgpr2, 0, implicit $exec ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, 
implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31 ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr29 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir index c764e150fe754..7c4e03fd0e6df 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -23,7 +23,7 @@ body: | liveins: $vgpr1 ; MUBUF-LABEL: name: scavenge_sgpr_pei_no_sgprs - ; MUBUF: liveins: $vgpr1, $vgpr2 + ; MUBUF: liveins: $sgpr40, $sgpr41, $vgpr1 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 @@ -54,40 +54,29 @@ body: | ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 - ; MUBUF-NEXT: $sgpr4 = COPY $sgpr33 + ; MUBUF-NEXT: $sgpr40 = frame-setup COPY $sgpr33 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION register $sgpr33, $sgpr40 ; MUBUF-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; MUBUF-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc - ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; 
MUBUF-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr2, 1048832 - ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 - ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr33, $vgpr2, 0, 32 - ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 - ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr34, $vgpr2, 1, 32 + ; MUBUF-NEXT: $sgpr41 = frame-setup COPY $sgpr34 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION register $sgpr34, $sgpr41 ; MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $sgpr33 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; MUBUF-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec - ; MUBUF-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec - ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit 
$sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; MUBUF-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; MUBUF-NEXT: $vgpr2 = V_ADD_U32_e32 16384, killed $vgpr2, implicit $exec + ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; MUBUF-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; MUBUF-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc - ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; MUBUF-NEXT: $sgpr34 = frame-destroy COPY $sgpr41 ; MUBUF-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_register $sgpr32 - ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4 + ; MUBUF-NEXT: $sgpr33 = frame-destroy COPY $sgpr40 ; MUBUF-NEXT: S_ENDPGM 0, implicit $vcc ; ; FLATSCR-LABEL: name: scavenge_sgpr_pei_no_sgprs - ; FLATSCR: liveins: $vgpr1, $vgpr2 + ; FLATSCR: liveins: $sgpr40, $sgpr41, $vgpr1 ; FLATSCR-NEXT: {{ $}} ; FLATSCR-NEXT: 
frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02 ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 @@ -118,37 +107,24 @@ body: | ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 - ; FLATSCR-NEXT: $sgpr4 = COPY $sgpr33 + ; FLATSCR-NEXT: $sgpr40 = frame-setup COPY $sgpr33 + ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION register $sgpr33, $sgpr40 ; FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc - ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc - ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) - ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr2, 1048832 - ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 - ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr33, $vgpr2, 0, 32 - ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 - ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr34, $vgpr2, 1, 32 + ; FLATSCR-NEXT: $sgpr41 = frame-setup COPY $sgpr34 + ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION register $sgpr34, $sgpr41 ; FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x41, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02 ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, 
implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc - ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -8192, implicit-def $scc - ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc - ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 - ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -16384, implicit-def $scc + ; FLATSCR-NEXT: $sgpr42 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc + ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr42, implicit $exec + ; FLATSCR-NEXT: $sgpr42 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc + ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $sgpr42, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, 
implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc - ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) - ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; FLATSCR-NEXT: $sgpr34 = frame-destroy COPY $sgpr41 ; FLATSCR-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02 - ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 + ; FLATSCR-NEXT: $sgpr33 = frame-destroy COPY $sgpr40 ; FLATSCR-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir index 
57c24aa152eab..cd335321e2156 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir @@ -22,7 +22,7 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei - ; CHECK: liveins: $vgpr1, $vgpr2 + ; CHECK: liveins: $sgpr40, $sgpr41, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 @@ -53,37 +53,24 @@ body: | ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 - ; CHECK-NEXT: $sgpr4 = COPY $sgpr33 + ; CHECK-NEXT: $sgpr40 = frame-setup COPY $sgpr33 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION register $sgpr33, $sgpr40 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 262080, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294705152, implicit-def dead $scc - ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 262400, implicit-def dead $scc - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr2, 262400 - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr33, $vgpr2, 0, 32 - ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr34, $vgpr2, 1, 32 + ; CHECK-NEXT: $sgpr41 = frame-setup COPY $sgpr34 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION register $sgpr34, $sgpr41 ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; 
CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $sgpr33 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 786432, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 4096, implicit-def $scc - ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33 - ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -4096, implicit-def $scc - ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK-NEXT: $sgpr42 = S_MOV_B32 4096 + ; CHECK-NEXT: $vgpr2, dead $sgpr42_sgpr43 = V_ADD_CO_U32_e64 killed $sgpr42, killed $vgpr2, 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, 
implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 262400, implicit-def dead $scc - ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) - ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr41 ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_register $sgpr32 - ; CHECK-NEXT: $sgpr33 = COPY $sgpr4 + ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr40 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 
%stack.0, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir index 47647ca1914d7..fb3e8116d86a4 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -59,6 +59,38 @@ body: | ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; 
GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX8-NEXT: $sgpr4 = COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc @@ -128,6 +160,38 @@ body: | ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 
+ ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX9-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX9-NEXT: $sgpr4 = COPY $sgpr33 ; GFX9-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX9-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc @@ -195,6 +259,38 @@ body: | ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; 
GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; 
GFX9-FLATSCR-NEXT: $sgpr4 = COPY $sgpr33 ; GFX9-FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX9-FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll index aeb7faade4715..830d7cc840aeb 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) { ; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x( @@ -39,7 +39,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i5 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] { +; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] { ; 
PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll index ab0fb7584d50c..1a445af94b9ad 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 
-passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s define amdgpu_kernel void @test_preload_IR_lowering_kernel_2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 @@ -185,7 +185,7 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %i ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] { +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-8-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 ; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]] @@ -220,14 +220,10 @@ define amdgpu_kernel void 
@test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad ; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 -; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset @@ -235,14 +231,10 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad ; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; 
PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset @@ -270,22 +262,16 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad ret void } -; Only preload the first sequence of arguments with the inreg attribute. In the NO-PRELOAD case this is just the first argument. 
- define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence(ptr addrspace(1) inreg %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence ; NO-PRELOAD-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 -; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr 
addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence @@ -293,14 +279,10 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_se ; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: 
define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll deleted file mode 100644 index 20edbd6c0d0fa..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll +++ /dev/null @@ -1,263 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=NO-PRELOAD %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-1 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=3 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-3 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=16 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-16 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=20 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-20 %s - -define amdgpu_kernel void @test_preload_hint_kernel_1(ptr %0) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; PRELOAD-1-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; PRELOAD-3-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; PRELOAD-16-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; 
PRELOAD-20-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_2(i32 %0, i64 %1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_4(i32 %0, i64 %1, <2 x float> %2, ptr %3) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg 
[[TMP3:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_18(i32 %0, i64 %1, <2 x float> %2, ptr %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14, i32 %15, i32 %16, i32 %17) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define 
{{[^@]+}}@test_preload_hint_kernel_18 -; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define void @test_preload_hint_non_kernel_2(i32 %0, i64 %1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; PRELOAD-1-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; PRELOAD-3-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; PRELOAD-16-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; PRELOAD-20-SAME: (i32 
[[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_1_call_func(ptr %0) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; NO-PRELOAD-NEXT: call void @func(ptr [[TMP0]]) -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; PRELOAD-1-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; PRELOAD-1-NEXT: call void @func(ptr [[TMP0]]) -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; PRELOAD-3-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; PRELOAD-3-NEXT: call void @func(ptr [[TMP0]]) -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; PRELOAD-16-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; PRELOAD-16-NEXT: call void @func(ptr [[TMP0]]) -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; PRELOAD-20-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; PRELOAD-20-NEXT: call void @func(ptr [[TMP0]]) -; PRELOAD-20-NEXT: ret void -; - call void @func(ptr %0) - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_1_call_intrinsic(i16 %0) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; NO-PRELOAD-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; PRELOAD-1-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) 
#[[ATTR3:[0-9]+]] { -; PRELOAD-3-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; PRELOAD-16-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; PRELOAD-20-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; PRELOAD-20-NEXT: ret void -; - call void @llvm.amdgcn.set.prio(i16 %0) - ret void -} - -define spir_kernel void @test_preload_hint_kernel_1_spir_cc(ptr %0) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; PRELOAD-1-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; PRELOAD-3-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; PRELOAD-16-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; PRELOAD-20-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_2_preexisting(i32 inreg %0, i64 %1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; NO-PRELOAD-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; 
PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_incompatible_attributes(ptr addrspace(4) byref(i32) %0, ptr nest %1) { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -declare void @func(ptr) #0 -declare void @llvm.amdgcn.set.prio(i16) - -attributes #0 = { nounwind } diff --git 
a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index 0f60888bcb2f5..20858bc603b99 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -566,13 +566,10 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x ; GFX940-NEXT: .p2align 8 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: .LBB14_0: -; GFX940-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 -; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 +; GFX940-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40 ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[12:13] ; GFX940-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 ; GFX940-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: v_mov_b32_e32 v3, s11 @@ -583,6 +580,10 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x ; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: v_mov_b32_e32 v3, s7 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, s12 +; GFX940-NEXT: v_mov_b32_e32 v1, s13 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] offset:32 sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v5f64_arg: @@ -593,13 +594,10 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB14_0: -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; GFX90a-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90a-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 ; GFX90a-NEXT: v_mov_b32_e32 v1, s13 ; 
GFX90a-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-NEXT: v_mov_b32_e32 v3, s15 @@ -610,6 +608,10 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x ; GFX90a-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-NEXT: v_mov_b32_e32 v3, s11 ; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-NEXT: s_nop 0 +; GFX90a-NEXT: v_mov_b32_e32 v0, s16 +; GFX90a-NEXT: v_mov_b32_e32 v1, s17 +; GFX90a-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32 ; GFX90a-NEXT: s_endpgm store <5 x double> %in, ptr addrspace(1) %out, align 8 ret void @@ -941,17 +943,15 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB23_0: -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-NEXT: global_store_short v3, v0, s[6:7] ; GFX90a-NEXT: v_mov_b32_e32 v0, s13 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12 +; GFX90a-NEXT: global_store_short v3, v0, s[14:15] offset:12 ; GFX90a-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[14:15] ; GFX90a-NEXT: s_endpgm store half %in, ptr addrspace(1) %out store <7 x bfloat> %in2, ptr addrspace(1) %out2 @@ -1191,15 +1191,13 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg % ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB29_0: -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-NEXT: v_mov_b32_e32 v4, s8 ; GFX90a-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-NEXT: v_mov_b32_e32 v1, s11 ; GFX90a-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-NEXT: global_store_short v3, v4, s[6:7] -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; 
GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[14:15] ; GFX90a-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store <3 x i32> %in2, ptr addrspace(1) %out2 diff --git a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll index c68143f44866f..1d061a8e4b78e 100644 --- a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll +++ b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll @@ -2,6 +2,10 @@ ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto" -print-pipeline-passes %s -o - | FileCheck %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto" -print-pipeline-passes %s -o - | FileCheck %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto" -print-pipeline-passes %s -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O0 -print-pipeline-passes %s -o - | FileCheck --check-prefix=O0 %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O1 -print-pipeline-passes %s -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O2 -print-pipeline-passes %s -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O3 -print-pipeline-passes %s -o - | FileCheck %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s @@ -9,9 +13,12 @@ ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s +; CHECK: amdgpu-expand-feature-predicates ; CHECK: amdgpu-attributor +; O0: amdgpu-expand-feature-predicates ; O0-NOT: amdgpu-attributor +; PRE: amdgpu-expand-feature-predicates ; PRE-NOT: internalize ; PRE-NOT: amdgpu-attributor diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll index e7b405d7d9270..24a4d8fbde200 100644 --- 
a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -519,8 +519,8 @@ define amdgpu_kernel void @alloca_promote_atomicrmw_private_lds_promote(ptr addr ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s6, 1 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-NEXT: s_cselect_b32 s4, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm entry: @@ -561,8 +561,8 @@ define amdgpu_kernel void @alloca_promote_cmpxchg_private(ptr addrspace(1) %out, ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s6, 1 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-NEXT: s_cselect_b32 s4, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index a1197aeace86f..0eb186f5e3d87 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -1776,20 +1776,20 @@ entry: define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX8-LABEL: DiffBase: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s42, -1 -; GFX8-NEXT: s_mov_b32 s43, 0xe80000 -; GFX8-NEXT: s_add_u32 s40, s40, s11 -; GFX8-NEXT: s_addc_u32 s41, s41, 0 +; GFX8-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s50, -1 +; GFX8-NEXT: s_mov_b32 s51, 0xe80000 +; GFX8-NEXT: s_add_u32 s48, s48, s11 +; GFX8-NEXT: s_addc_u32 s49, s49, 0 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; 
GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX8-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX8-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1839,20 +1839,20 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; ; GFX9-LABEL: DiffBase: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s42, -1 -; GFX9-NEXT: s_mov_b32 s43, 0xe00000 -; GFX9-NEXT: s_add_u32 s40, s40, s11 -; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s11 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1898,12 +1898,12 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; ; GFX10-LABEL: DiffBase: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s42, -1 -; GFX10-NEXT: s_mov_b32 s43, 0x31c16000 -; GFX10-NEXT: s_add_u32 s40, s40, 
s11 -; GFX10-NEXT: s_addc_u32 s41, s41, 0 +; GFX10-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s50, -1 +; GFX10-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX10-NEXT: s_add_u32 s48, s48, s11 +; GFX10-NEXT: s_addc_u32 s49, s49, 0 ; GFX10-NEXT: s_getpc_b64 s[0:1] ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 @@ -1911,8 +1911,8 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX10-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll index 1afd31c6d45e7..5169100c8d76e 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s ; Check propagation of amdgpu-flat-work-group-size attribute. 
@@ -170,10 +169,10 @@ define amdgpu_kernel void @kernel_64_256() #7 { define internal void @default_captured_address() { ; CHECK-LABEL: define {{[^@]+}}@default_captured_address ; CHECK-SAME: () #[[ATTR8:[0-9]+]] { -; CHECK-NEXT: store volatile ptr @default_captured_address, ptr undef, align 8 +; CHECK-NEXT: store volatile ptr @default_captured_address, ptr poison, align 8 ; CHECK-NEXT: ret void ; - store volatile ptr @default_captured_address, ptr undef, align 8 + store volatile ptr @default_captured_address, ptr poison, align 8 ret void } @@ -205,11 +204,11 @@ attributes #7 = { "amdgpu-flat-work-group-size"="64,256" } ;. ; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,1024" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,1024" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { 
"amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index 6a909f52082d6..7eecb76d780a0 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -14,7 +14,7 @@ define internal void @default_to_1_8_a() { define amdgpu_kernel void @kernel_1_8() #0 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_1_8 -; CHECK-SAME: () #[[ATTR0]] { +; CHECK-SAME: () #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: call void @default_to_1_8_a() ; CHECK-NEXT: ret void ; @@ -25,7 +25,7 @@ define amdgpu_kernel void @kernel_1_8() #0 { ; Called from a single kernel with 1,2 define internal void @default_to_1_2() { ; CHECK-LABEL: define internal void @default_to_1_2 -; CHECK-SAME: () #[[ATTR1:[0-9]+]] { +; CHECK-SAME: () #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -33,7 +33,7 @@ define internal void @default_to_1_2() { define amdgpu_kernel void @kernel_1_2() #1 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_1_2 -; CHECK-SAME: () #[[ATTR1]] { +; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: call void @default_to_1_2() ; CHECK-NEXT: call void @flat_group_1_1() ; CHECK-NEXT: call void @default_to_1_8_b() @@ -50,7 +50,7 @@ define amdgpu_kernel void @kernel_1_2() #1 { ; Called from a single kernel with 1,4 define internal void @default_to_1_4() { ; CHECK-LABEL: define internal void @default_to_1_4 -; CHECK-SAME: () #[[ATTR2:[0-9]+]] { +; CHECK-SAME: () #[[ATTR3:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -58,7 +58,7 @@ define internal void @default_to_1_4() { define amdgpu_kernel void @kernel_1_4() #2 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_1_4 -; CHECK-SAME: () #[[ATTR2]] { +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: call void @default_to_1_4() ; CHECK-NEXT: ret void ; @@ -69,7 +69,7 @@ define amdgpu_kernel void @kernel_1_4() #2 { ; Called from kernels with 2,9 and 9,9 define internal void @default_to_2_9() { ; CHECK-LABEL: define internal void 
@default_to_2_9 -; CHECK-SAME: () #[[ATTR3:[0-9]+]] { +; CHECK-SAME: () #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -79,7 +79,7 @@ define internal void @default_to_2_9() { ; bounds, and should not be changed. define internal void @flat_group_1_1() #3 { ; CHECK-LABEL: define internal void @flat_group_1_1 -; CHECK-SAME: () #[[ATTR4:[0-9]+]] { +; CHECK-SAME: () #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -88,7 +88,7 @@ define internal void @flat_group_1_1() #3 { ; 2,8 -> 2,2 define internal void @flat_group_2_8() #4 { ; CHECK-LABEL: define internal void @flat_group_2_8 -; CHECK-SAME: () #[[ATTR5:[0-9]+]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: ret void ; ret void @@ -105,7 +105,7 @@ define internal void @flat_group_9_10() #5 { define amdgpu_kernel void @kernel_2_9() #6 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_2_9 -; CHECK-SAME: () #[[ATTR3]] { +; CHECK-SAME: () #[[ATTR7:[0-9]+]] { ; CHECK-NEXT: call void @default_to_2_9() ; CHECK-NEXT: call void @flat_group_1_1() ; CHECK-NEXT: ret void @@ -117,7 +117,7 @@ define amdgpu_kernel void @kernel_2_9() #6 { define amdgpu_kernel void @kernel_9_9() #7 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_9_9 -; CHECK-SAME: () #[[ATTR7:[0-9]+]] { +; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: call void @default_to_2_9() ; CHECK-NEXT: call void @flat_group_9_10() ; CHECK-NEXT: ret void @@ -140,7 +140,7 @@ define internal void @default_to_1_8_b() { ; this should probably be illegal. 
define amdgpu_kernel void @kernel_2_8() #4 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_2_8 -; CHECK-SAME: () #[[ATTR5]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: call void @default_to_1_8_a() ; CHECK-NEXT: call void @default_to_1_8_b() ; CHECK-NEXT: ret void @@ -153,7 +153,7 @@ define amdgpu_kernel void @kernel_2_8() #4 { ; 1,2 -> 2,2 define internal void @merge_cycle_0() #1 { ; CHECK-LABEL: define internal void @merge_cycle_0 -; CHECK-SAME: () #[[ATTR1]] { +; CHECK-SAME: () #[[ATTR2]] { ; CHECK-NEXT: call void @merge_cycle_1() ; CHECK-NEXT: ret void ; @@ -165,7 +165,7 @@ define internal void @merge_cycle_0() #1 { ; 2,8 -> 2,8 define internal void @merge_cycle_1() #4 { ; CHECK-LABEL: define internal void @merge_cycle_1 -; CHECK-SAME: () #[[ATTR5]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: call void @merge_cycle_0() ; CHECK-NEXT: ret void ; @@ -192,10 +192,10 @@ define amdgpu_kernel void @kernel_3_8() #8 { define internal void @default_captured_address() { ; CHECK-LABEL: define internal void @default_captured_address ; CHECK-SAME: () #[[ATTR9:[0-9]+]] { -; CHECK-NEXT: store volatile ptr @default_captured_address, ptr undef, align 8 +; CHECK-NEXT: store volatile ptr @default_captured_address, ptr poison, align 8 ; CHECK-NEXT: ret void ; - store volatile ptr @default_captured_address, ptr undef, align 8 + store volatile ptr @default_captured_address, ptr poison, align 8 ret void } @@ -218,7 +218,7 @@ define internal i32 @bitcasted_function() { define internal void @called_from_invalid_bounds_0() { ; CHECK-LABEL: define internal void @called_from_invalid_bounds_0 -; CHECK-SAME: () #[[ATTR10:[0-9]+]] { +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret void ; ret void @@ -226,7 +226,7 @@ define internal void @called_from_invalid_bounds_0() { define internal void @called_from_invalid_bounds_1() { ; CHECK-LABEL: define internal void @called_from_invalid_bounds_1 -; CHECK-SAME: () #[[ATTR10]] { +; CHECK-SAME: () #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: ret void ; 
ret void @@ -235,7 +235,7 @@ define internal void @called_from_invalid_bounds_1() { ; Invalid range for amdgpu-waves-per-eu define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8 -; CHECK-SAME: () #[[ATTR0]] { +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: call void @called_from_invalid_bounds_0() ; CHECK-NEXT: ret void ; @@ -399,14 +399,14 @@ attributes #17 = { "amdgpu-waves-per-eu"="5,8" } attributes #18 = { "amdgpu-waves-per-eu"="9,10" } attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" 
"amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } 
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll index a439f8df10a26..e5e3ba6cdcaf0 100644 --- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll +++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll @@ -11,6 +11,7 @@ define amdgpu_cs float @v_s_exp_f32(float inreg %src) { ; GFX12-NEXT: s_add_f32 s0, s0, s1 ; GFX12-NEXT: s_cselect_b32 s1, 0xffffffc0, 0 ; GFX12-NEXT: v_s_exp_f32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_ldexp_f32 v0, s0, s1 ; GFX12-NEXT: ; return to shader part epilog @@ -22,6 +23,7 @@ 
define amdgpu_cs half @v_s_exp_f16(half inreg %src) { ; GFX12-LABEL: v_s_exp_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_exp_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -33,6 +35,7 @@ define amdgpu_cs float @v_s_amdgcn_exp_f32(float inreg %src) { ; GFX12-LABEL: v_s_amdgcn_exp_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_exp_f32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -44,6 +47,7 @@ define amdgpu_cs half @v_s_amdgcn_exp_f16(half inreg %src) { ; GFX12-LABEL: v_s_amdgcn_exp_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_exp_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -55,14 +59,9 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) { ; GFX12-SDAG-LABEL: v_s_log_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0x800000 -; GFX12-SDAG-NEXT: s_cselect_b32 s1, -1, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_ldexp_f32 v0, s0, v0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_and_b32 s0, s1, exec_lo +; GFX12-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_ldexp_f32 v0, s0, s1 ; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0 ; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe @@ -92,6 +91,7 @@ define amdgpu_cs half @v_s_log_f16(half inreg %src) { ; GFX12-LABEL: v_s_log_f16: ; 
GFX12: ; %bb.0: ; GFX12-NEXT: v_s_log_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -103,6 +103,7 @@ define amdgpu_cs float @v_s_amdgcn_log_f32(float inreg %src) { ; GFX12-LABEL: v_s_amdgcn_log_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_log_f32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -114,6 +115,7 @@ define amdgpu_cs half @v_s_amdgcn_log_f16(half inreg %src) { ; GFX12-LABEL: v_s_amdgcn_log_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_log_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -125,6 +127,7 @@ define amdgpu_cs float @v_s_rcp_f32(float inreg %src) { ; GFX12-LABEL: v_s_rcp_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_rcp_f32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -136,6 +139,7 @@ define amdgpu_cs half @v_s_rcp_f16(half inreg %src) { ; GFX12-LABEL: v_s_rcp_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_rcp_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -148,6 +152,7 @@ define amdgpu_cs float @v_s_rsq_f32(float inreg %src) { ; GFX12-SDAG-LABEL: v_s_rsq_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_s_rsq_f32 s0, s0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: ; return to shader part epilog @@ -155,8 +160,10 @@ define amdgpu_cs float @v_s_rsq_f32(float inreg %src) { ; GFX12-GISEL-LABEL: v_s_rsq_f32: ; GFX12-GISEL: ; %bb.0: ; 
GFX12-GISEL-NEXT: v_s_sqrt_f32 s0, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) ; GFX12-GISEL-NEXT: v_s_rcp_f32 s0, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %sqrt = call fast float @llvm.sqrt.f32(float %src) @@ -168,6 +175,7 @@ define amdgpu_cs half @v_s_rsq_f16(half inreg %src) { ; GFX12-LABEL: v_s_rsq_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_rsq_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -185,7 +193,6 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_cselect_b32 s1, s1, s0 ; GFX12-SDAG-NEXT: v_s_sqrt_f32 s2, s1 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_mov_b32 s4, s1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s3, s2, -1 @@ -223,7 +230,6 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_cselect_b32 s0, s2, s0 ; GFX12-GISEL-NEXT: v_s_sqrt_f32 s2, s0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_mov_b32 s4, s0 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -241,11 +247,12 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_cselect_b32 s2, s5, s2 ; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-GISEL-NEXT: s_mul_f32 s3, s2, 0x37800000 -; GFX12-GISEL-NEXT: s_delay_alu 
instid0(SALU_CYCLE_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: s_cselect_b32 s1, s3, s2 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-GISEL-NEXT: v_cmp_class_f32_e64 s1, s0, 0x260 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.sqrt.f32(float %src) @@ -256,6 +263,7 @@ define amdgpu_cs half @v_s_sqrt_f16(half inreg %src) { ; GFX12-LABEL: v_s_sqrt_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_sqrt_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -267,6 +275,7 @@ define amdgpu_cs float @v_amdgcn_sqrt_f32(float inreg %src) { ; GFX12-LABEL: v_amdgcn_sqrt_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_sqrt_f32 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -278,6 +287,7 @@ define amdgpu_cs half @v_amdgcn_sqrt_f16(half inreg %src) { ; GFX12-LABEL: v_amdgcn_sqrt_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_sqrt_f16 s0, s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -291,13 +301,8 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_and_b32 s1, s0, 0x7fffffff ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_cmp_lt_f32 s1, 0x800000 -; GFX12-SDAG-NEXT: s_cselect_b32 s1, -1, 0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 
v0, 5, v0 -; GFX12-SDAG-NEXT: v_ldexp_f32 v0, |s0|, v0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_and_b32 s0, s1, exec_lo +; GFX12-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX12-SDAG-NEXT: v_ldexp_f32 v0, |s0|, s1 ; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) ; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -329,14 +334,9 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) { ; GFX12-SDAG-LABEL: srcmods_neg_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_cmp_gt_f32 s0, 0x80800000 -; GFX12-SDAG-NEXT: s_cselect_b32 s1, -1, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_ldexp_f32 v0, -s0, v0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_and_b32 s0, s1, exec_lo +; GFX12-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_ldexp_f32 v0, -s0, s1 ; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0 ; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe @@ -368,6 +368,7 @@ define amdgpu_cs half @srcmods_abs_f16(half inreg %src) { ; GFX12-LABEL: srcmods_abs_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_log_f16 s0, |s0| +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog @@ -380,6 +381,7 @@ define amdgpu_cs half @srcmods_neg_f16(half inreg %src) { ; GFX12-LABEL: srcmods_neg_f16: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_s_log_f16 s0, -s0 +; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to 
shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index c6ee557d970cd..4a0bb6ceccd3f 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -41,63 +41,102 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr36_sgpr37 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr38_sgpr39 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr40_sgpr41 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr60 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr42_sgpr43 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit $exec - ; CHECK-NEXT: renamable $sgpr44_sgpr45 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr61 = S_MOV_B32 1083786240 + ; CHECK-NEXT: renamable $sgpr18_sgpr19 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef 
%18:vgpr_32, implicit $exec + ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec + ; CHECK-NEXT: renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 1083786240 + ; CHECK-NEXT: SI_SPILL_S1024_SAVE renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, 
$sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr44_sgpr45, implicit-def dead $scc + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]] ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.5(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr64 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr65 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr66 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr67 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr68 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr69 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr70 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr71 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr72 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr73 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr74 = COPY 
renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr75 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr76 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr77 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr78 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr79 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr80 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr81 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr82 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr83 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr84 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr85 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr86 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr87 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr88 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr89 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr90 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr91 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr92 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr93 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr94 = COPY renamable $sgpr60 - ; CHECK-NEXT: renamable $sgpr95 = COPY renamable $sgpr60 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $exec + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable 
$sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr40 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr41 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr42 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr43 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr44 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr45 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr46 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr47 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr48 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr49 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr50 = COPY renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr51 = COPY killed renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 + ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr56 = COPY killed renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE 
%stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 + ; CHECK-NEXT: renamable $sgpr52 = COPY renamable $sgpr56 + ; CHECK-NEXT: renamable $sgpr53 = COPY killed renamable $sgpr76 + ; CHECK-NEXT: renamable $sgpr56_sgpr57 = COPY renamable $sgpr52_sgpr53 + ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 + ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 + ; CHECK-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr56_sgpr57 + ; CHECK-NEXT: renamable $sgpr54 = COPY killed renamable $sgpr76 + ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 + ; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50 = COPY renamable $sgpr52_sgpr53_sgpr54 + ; CHECK-NEXT: renamable 
$sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr48_sgpr49_sgpr50 + ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 + ; CHECK-NEXT: renamable $sgpr55 = COPY killed renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr56 = COPY killed renamable $sgpr72 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr57 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable 
$sgpr58 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr59 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr60 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr61 = COPY killed renamable $sgpr80 + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr62 = COPY killed renamable $sgpr80 + ; CHECK-NEXT: renamable 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr63 = COPY killed renamable $sgpr80 + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr64 = COPY killed renamable $sgpr80 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr65 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr66 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = 
SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr67 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 + ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = COPY renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 + ; CHECK-NEXT: renamable $sgpr64 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr65 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; 
CHECK-NEXT: renamable $sgpr66 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr67 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} @@ -126,111 +165,117 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.6(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 renamable $sgpr38_sgpr39, undef renamable $sgpr46_sgpr47, implicit-def dead $scc - ; CHECK-NEXT: renamable $sgpr46_sgpr47 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = 
SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 killed renamable $sgpr12_sgpr13, undef renamable $sgpr54_sgpr55, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr54_sgpr55 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr12_sgpr13 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: ; CHECK-NEXT: successors: %bb.7(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr40_sgpr41, implicit $exec + ; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: ; CHECK-NEXT: successors: %bb.8(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, 
$sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr48_sgpr49 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $sgpr50_sgpr51 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $sgpr64_sgpr65 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $sgpr66_sgpr67 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec ; CHECK-NEXT: dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V32_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr14, 11, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.8: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.9(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, 
$sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr64_sgpr65, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.10, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.9: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY renamable $sgpr60_sgpr61, implicit $exec + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY2]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1) - ; 
CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr18_sgpr19, implicit $exec ; CHECK-NEXT: dead renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec - ; CHECK-NEXT: renamable $sgpr58 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr82 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: renamable $sgpr52_sgpr53 = COPY killed renamable $sgpr4_sgpr5 - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY renamable $sgpr52_sgpr53 - ; CHECK-NEXT: renamable $sgpr54_sgpr55 = COPY killed renamable $sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY renamable $sgpr54_sgpr55 - ; CHECK-NEXT: renamable $sgpr56_sgpr57 = COPY killed renamable $sgpr10_sgpr11 - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY renamable $sgpr56_sgpr57 + ; CHECK-NEXT: renamable $sgpr68_sgpr69 = COPY killed renamable $sgpr4_sgpr5 + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY renamable $sgpr68_sgpr69 + ; CHECK-NEXT: renamable $sgpr70_sgpr71 = COPY killed renamable $sgpr6_sgpr7 + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY renamable $sgpr70_sgpr71 + ; CHECK-NEXT: renamable $sgpr80_sgpr81 = COPY killed renamable $sgpr10_sgpr11 + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY renamable $sgpr80_sgpr81 ; CHECK-NEXT: $sgpr12 = COPY renamable $sgpr14 ; CHECK-NEXT: $sgpr13 = COPY renamable $sgpr15 - ; CHECK-NEXT: renamable $sgpr62 = COPY killed renamable $sgpr8 + ; CHECK-NEXT: renamable $sgpr84 = COPY killed renamable $sgpr8 ; CHECK-NEXT: renamable $sgpr33 = COPY killed renamable $sgpr16 - ; CHECK-NEXT: renamable $sgpr59 = COPY killed renamable $sgpr15 - ; CHECK-NEXT: renamable $sgpr63 = COPY killed renamable $sgpr14 + ; CHECK-NEXT: renamable $sgpr83 = COPY killed renamable $sgpr15 + ; CHECK-NEXT: renamable $sgpr85 = COPY killed 
renamable $sgpr14 + ; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr18_sgpr19 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr58_sgpr59 + ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9 - ; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr63 - ; CHECK-NEXT: renamable $sgpr15 = COPY killed renamable $sgpr59 + ; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49 + ; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr85 + ; CHECK-NEXT: renamable $sgpr15 = COPY killed renamable $sgpr83 ; CHECK-NEXT: renamable $sgpr16 = COPY killed renamable $sgpr33 - ; CHECK-NEXT: renamable $sgpr4_sgpr5 = COPY killed renamable $sgpr52_sgpr53 - ; CHECK-NEXT: renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr54_sgpr55 - ; CHECK-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr62 - ; CHECK-NEXT: renamable $sgpr10_sgpr11 = COPY killed renamable $sgpr56_sgpr57 + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = COPY killed renamable $sgpr68_sgpr69 + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr70_sgpr71 + ; CHECK-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr84 + ; CHECK-NEXT: renamable $sgpr10_sgpr11 = COPY killed renamable $sgpr80_sgpr81 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $exec = S_MOV_B64_term renamable $sgpr50_sgpr51 + ; CHECK-NEXT: $exec = S_MOV_B64_term renamable $sgpr66_sgpr67 ; 
CHECK-NEXT: S_CBRANCH_EXECZ %bb.10, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.10: ; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.12(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.11: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.12: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.13(0x40000000) - ; 
CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr46_sgpr47 + ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr54_sgpr55 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.13: ; CHECK-NEXT: successors: %bb.15(0x40000000), %bb.14(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr42_sgpr43, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def 
dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.15, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.14 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.14: ; CHECK-NEXT: successors: %bb.15(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.15: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.16(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x0000000F00000000 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr36_sgpr37, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.11, implicit $vcc ; 
CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.16: diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index e357b7f6a7fb5..62f50e7b887fe 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 3 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-attributor,amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefixes=CHECK,TABLE %s -; this needs rework downstream ; FIXME: Work around update_test_checks bug in constant expression handling by manually deleting part of the last global pattern @function.lds = addrspace(3) global i16 poison @@ -180,7 +179,7 @@ define internal void @mutual_recursion_1(i16 %arg) { define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_lds_recursion( ; CHECK-SAME: ) #[[ATTR5:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META9:![0-9]+]] { -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ], !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]] ; CHECK-NEXT: call void @mutual_recursion_0(i16 0) ; CHECK-NEXT: ret void ; @@ -195,9 +194,9 @@ define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-lds-size"="4" "amdgpu-no-agpr" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-lds-size"="4" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. @@ -211,6 +210,11 @@ define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK: [[META7]] = !{[[META8:![0-9]+]]} ; CHECK: [[META8]] = distinct !{[[META8]], [[META6]]} ; CHECK: [[META9]] = !{i32 2} +; CHECK: [[META10]] = !{[[META11:![0-9]+]]} +; CHECK: [[META11]] = distinct !{[[META11]], [[META12:![0-9]+]]} +; CHECK: [[META12]] = distinct !{[[META12]]} +; CHECK: [[META13]] = !{[[META14:![0-9]+]]} +; CHECK: [[META14]] = distinct !{[[META14]], [[META12]]} ;. ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; TABLE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll index 554e3640221b9..b78cbb0ac29cf 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -19,12 +19,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) { ; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 ; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8 ; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -106,10 +106,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) { ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 ; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -199,12 +199,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) { ; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 ; SI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1] ; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] 
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0 -; SI-SDAG-NEXT: s_and_b64 s[0:1], s[2:3], exec -; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8 ; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -286,10 +286,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) { ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 ; VI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-SDAG-NEXT: s_and_b64 s[2:3], s[2:3], exec +; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0 -; VI-SDAG-NEXT: s_and_b64 s[0:1], s[2:3], exec ; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -380,12 +380,12 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) { ; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 ; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8 ; SI-SDAG-NEXT: s_mov_b32 s2, 0xbff00000 ; 
SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -467,10 +467,10 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) { ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 ; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -560,12 +560,12 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) { ; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 9 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1] ; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0 -; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8 ; SI-SDAG-NEXT: s_mov_b32 s2, 0xbff00000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -647,10 +647,10 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) { ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 9 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], 
-s[0:1], v0 -; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -741,12 +741,12 @@ define double @v_rsq_f64(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 @@ -827,8 +827,8 @@ define double @v_rsq_f64(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -910,12 +910,12 @@ define double @v_rsq_f64_fabs(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: 
v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 @@ -996,8 +996,8 @@ define double @v_rsq_f64_fabs(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -1080,12 +1080,12 @@ define double @v_rsq_f64_missing_contract0(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 @@ -1166,8 +1166,8 @@ define double @v_rsq_f64_missing_contract0(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: 
v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -1249,12 +1249,12 @@ define double @v_rsq_f64_missing_contract1(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 @@ -1335,8 +1335,8 @@ define double @v_rsq_f64_missing_contract1(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -1418,12 +1418,12 @@ define double @v_neg_rsq_f64(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: 
v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 @@ -1504,8 +1504,8 @@ define double @v_neg_rsq_f64(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -1588,23 +1588,22 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5] ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 ; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] ; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 -; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] +; SI-SDAG-NEXT: 
v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] ; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] ; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] @@ -1743,45 +1742,44 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] ; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] -; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] -; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] ; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 -; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 ; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 -; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 ; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], 
v[12:13], v[8:9] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] ; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1] -; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] -; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] ; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] ; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] ; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 -; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] ; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 ; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 -; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; VI-SDAG-NEXT: v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] ; VI-SDAG-NEXT: v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], 1.0 ; VI-SDAG-NEXT: v_div_scale_f64 v[17:18], s[4:5], 1.0, v[2:3], 1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[9:10], v[5:6] @@ -1890,23 +1888,22 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] -; SI-SDAG-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5] ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 ; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] ; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 -; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] ; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] ; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] @@ -2045,45 +2042,44 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] ; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; 
VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] -; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] -; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] ; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 -; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 ; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 -; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 ; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] ; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1] -; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] -; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] ; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] ; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] ; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 -; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] ; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc ; 
VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 ; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 -; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; VI-SDAG-NEXT: v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], -1.0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] ; VI-SDAG-NEXT: v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], -1.0 ; VI-SDAG-NEXT: v_div_scale_f64 v[17:18], s[4:5], -1.0, v[2:3], -1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[9:10], v[5:6] @@ -2191,12 +2187,12 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 @@ -2315,8 +2311,8 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 
0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -2434,23 +2430,22 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5] ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 ; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] ; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 -; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] -; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 -; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] ; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] ; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] @@ -2592,45 +2587,44 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; VI-SDAG-NEXT: s_brev_b32 s5, 
8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] ; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] -; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] -; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 +; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] ; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 -; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 ; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5 -; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5 ; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] -; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] ; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1] -; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] -; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] ; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9] ; 
VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] ; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 -; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] ; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 +; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 ; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9 ; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 -; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; VI-SDAG-NEXT: v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], -1.0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] ; VI-SDAG-NEXT: v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], 1.0 ; VI-SDAG-NEXT: v_div_scale_f64 v[17:18], s[4:5], 1.0, v[2:3], 1.0 ; VI-SDAG-NEXT: v_rcp_f64_e32 v[9:10], v[5:6] @@ -2738,12 +2732,12 @@ define double @v_rsq_f64_fneg_fabs(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 9 ; SI-SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 ; 
SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 @@ -2824,8 +2818,8 @@ define double @v_rsq_f64_fneg_fabs(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 9 ; VI-SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -2909,12 +2903,12 @@ define double @v_rsq_f64__afn_sqrt(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 @@ -2995,8 +2989,8 @@ define double @v_rsq_f64__afn_sqrt(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -3078,12 +3072,12 @@ define double @v_rsq_f64__afn_fdiv(double %x) { ; SI-SDAG-NEXT: 
s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 @@ -3148,8 +3142,8 @@ define double @v_rsq_f64__afn_fdiv(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -3223,12 +3217,12 @@ define double @v_rsq_f64__afn(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 
0.5 @@ -3293,8 +3287,8 @@ define double @v_rsq_f64__afn(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -3368,12 +3362,12 @@ define double @v_neg_rsq_f64__afn(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 @@ -3439,8 +3433,8 @@ define double @v_neg_rsq_f64__afn(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -3515,12 +3509,12 @@ define double @v_rsq_f64__afn_ninf(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], 
v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 @@ -3585,8 +3579,8 @@ define double @v_rsq_f64__afn_ninf(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -3660,12 +3654,12 @@ define double @v_rsq_f64__afn_nnan(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 @@ -3730,8 +3724,8 @@ define double @v_rsq_f64__afn_nnan(double %x) { ; VI-SDAG-NEXT: 
s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -3805,12 +3799,12 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 @@ -3875,8 +3869,8 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -3950,12 +3944,12 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; 
SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 @@ -4021,8 +4015,8 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -4097,12 +4091,12 @@ define double @v_rsq_f64__nnan_ninf(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 @@ -4183,8 +4177,8 @@ define double @v_rsq_f64__nnan_ninf(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; 
VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -4266,40 +4260,39 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-SDAG-NEXT: v_mov_b32_e32 v12, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] ; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 ; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260 -; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 ; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] ; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3] ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7] ; SI-SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] ; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc ; SI-SDAG-NEXT: v_fma_f64 v[4:5], 
v[10:11], v[4:5], v[6:7] ; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[0:1], v[8:9] ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5 ; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v12 ; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] ; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[6:7], v[6:7], v[0:1] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[8:9], v[6:7] -; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1] +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7] ; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] ; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 @@ -4391,11 +4384,10 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] ; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] +; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] @@ -4417,8 +4409,8 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { ; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], 
v[0:1] ; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 @@ -4519,12 +4511,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 { ; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 ; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] ; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 +; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 @@ -4590,10 +4582,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 { ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 ; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -4675,12 +4667,12 @@ define double @v_rsq_f64_unsafe(double %x) #0 { ; SI-SDAG-NEXT: 
s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 @@ -4745,8 +4737,8 @@ define double @v_rsq_f64_unsafe(double %x) #0 { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] @@ -5074,24 +5066,24 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SI-SDAG-NEXT: v_mov_b32_e32 v11, 0x260 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; 
SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11 +; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x260 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] @@ -5158,8 +5150,8 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] @@ -5241,24 +5233,24 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SI-SDAG-NEXT: v_mov_b32_e32 v11, 0x260 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 
v[6:7], v[2:3], v[4:5] ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11 +; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x260 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] @@ -5325,8 +5317,8 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] @@ -5408,24 +5400,24 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; SI-SDAG-NEXT: 
v_cndmask_b32_e32 v4, 0, v4, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SI-SDAG-NEXT: v_mov_b32_e32 v11, 0x260 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7] +; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3] ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11 +; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x260 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] @@ -5492,8 +5484,8 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] ; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5] @@ -5575,17 +5567,17 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; SI-SDAG-NEXT: s_mov_b32 s4, 0 ; SI-SDAG-NEXT: s_brev_b32 s5, 8 ; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SI-SDAG-NEXT: 
v_mov_b32_e32 v8, 0xffffff80 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: s_mov_b32 s7, 0x40700000 -; SI-SDAG-NEXT: s_mov_b32 s8, 0x40700000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 +; SI-SDAG-NEXT: s_mov_b32 s8, 0x40700000 ; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] ; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] @@ -5665,10 +5657,10 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_brev_b32 s5, 8 ; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100 ; VI-SDAG-NEXT: s_mov_b32 s4, 0 ; VI-SDAG-NEXT: s_mov_b32 s5, 0x40700000 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll index 83a077f7f74db..1dcc6a19c29d7 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll @@ -20,7 +20,6 @@ define void @func1() { ; GFX12-SDAG-NEXT: s_mov_b32 m0, 3 ; GFX12-SDAG-NEXT: s_barrier_join m0 ; GFX12-SDAG-NEXT: s_barrier_wait 1 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: func1: @@ -35,7 +34,6 @@ define void @func1() { ; GFX12-GISEL-NEXT: 
s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_barrier_join 3 ; GFX12-GISEL-NEXT: s_barrier_wait 1 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) @@ -57,7 +55,6 @@ define void @func2() { ; GFX12-SDAG-NEXT: s_mov_b32 m0, 1 ; GFX12-SDAG-NEXT: s_barrier_join m0 ; GFX12-SDAG-NEXT: s_barrier_wait 1 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: func2: @@ -72,7 +69,6 @@ define void @func2() { ; GFX12-GISEL-NEXT: s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_barrier_join 1 ; GFX12-GISEL-NEXT: s_barrier_wait 1 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) @@ -94,11 +90,10 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshr_b32 s2, s2, 4 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_and_b32 s2, s2, 63 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_or_b32 s3, 0x90000, s2 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX12-SDAG-NEXT: s_barrier_init m0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002 @@ -117,16 +112,13 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_get_barrier_state s2, m0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3] -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_sext_i32_i16 s3, s3 -; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func1@gotpcrel32@lo+12 -; GFX12-SDAG-NEXT: 
s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func1@gotpcrel32@hi+24 +; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func1@gotpcrel32@lo+8 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func1@gotpcrel32@hi+16 ; GFX12-SDAG-NEXT: s_barrier_signal -1 ; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-SDAG-NEXT: s_barrier_wait -1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe @@ -136,7 +128,6 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func2@gotpcrel32@hi+24 ; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX12-SDAG-NEXT: s_get_barrier_state s0, -1 ; GFX12-SDAG-NEXT: s_endpgm @@ -154,11 +145,10 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshr_b32 s0, s0, 4 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_and_b32 s0, s0, 63 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_or_b32 s1, s0, 0x90000 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_mov_b32 m0, s1 ; GFX12-GISEL-NEXT: s_barrier_init m0 ; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0002 @@ -178,16 +168,13 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1] -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 
func1@gotpcrel32@lo+12 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+24 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func1@gotpcrel32@lo+8 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+16 ; GFX12-GISEL-NEXT: s_barrier_signal -1 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-GISEL-NEXT: s_barrier_wait -1 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0 @@ -199,7 +186,6 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX12-GISEL-NEXT: s_get_barrier_state s0, -1 ; GFX12-GISEL-NEXT: s_endpgm @@ -226,11 +212,9 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX12-SDAG-NEXT: s_getpc_b64 s[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; GFX12-SDAG-NEXT: s_sext_i32_i16 s7, s7 -; GFX12-SDAG-NEXT: s_add_co_u32 s6, s6, func2@gotpcrel32@lo+12 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_add_co_ci_u32 s7, s7, func2@gotpcrel32@hi+24 +; GFX12-SDAG-NEXT: s_add_co_u32 s6, s6, func2@gotpcrel32@lo+8 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s7, s7, func2@gotpcrel32@hi+16 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0 ; GFX12-SDAG-NEXT: s_load_b64 s[12:13], s[6:7], 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70002 @@ -243,7 +227,6 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX12-SDAG-NEXT: s_barrier_join m0 ; GFX12-SDAG-NEXT: s_barrier_wait 1 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe ; 
GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-SDAG-NEXT: s_endpgm ; @@ -252,11 +235,9 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_add_co_u32 s8, s4, 48 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s5, 0 ; GFX12-GISEL-NEXT: s_getpc_b64 s[4:5] -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_sext_i32_i16 s5, s5 -; GFX12-GISEL-NEXT: s_add_co_u32 s4, s4, func2@gotpcrel32@lo+12 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+24 +; GFX12-GISEL-NEXT: s_add_co_u32 s4, s4, func2@gotpcrel32@lo+8 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+16 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0 ; GFX12-GISEL-NEXT: s_load_b64 s[12:13], s[4:5], 0x0 ; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] @@ -268,7 +249,6 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_barrier_join 2 ; GFX12-GISEL-NEXT: s_barrier_wait 1 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-GISEL-NEXT: s_endpgm call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7) diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll index 144a5f4b009a7..8010c1286ea98 100644 --- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll @@ -86,7 +86,6 @@ define void @test_remat_s_getpc_b64() { ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i64 @llvm.amdgcn.s.getpc() diff --git a/llvm/test/CodeGen/AMDGPU/sched-no-schedmodel.mir b/llvm/test/CodeGen/AMDGPU/sched-no-schedmodel.mir new file mode 100644 index 0000000000000..09b326c5a63a1 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/sched-no-schedmodel.mir @@ -0,0 +1,50 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -misched-cluster=false --misched-prera-direction=topdown -run-pass=machine-scheduler --schedmodel=1 -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -misched-cluster=false --misched-prera-direction=topdown -run-pass=machine-scheduler --schedmodel=0 -o - %s | FileCheck -check-prefix=GCN-NO-SCHEDMODEL %s + +--- +name: sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE +tracksRegLiveness: true +body: | + bb.0: + + ; GCN-LABEL: name: sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE + ; GCN: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: early-clobber %2:vreg_512_align2 = contract V_MFMA_F32_32X32X16_FP8_FP8_vgprcd_e64 [[DEF]].sub0_sub1, [[DEF1]].sub0_sub1, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: dead [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF2]], 0, 0, implicit $exec + ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: dead [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF3]], 0, 0, implicit $exec + ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: dead [[DS_READ_U16_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF4]], 0, 0, implicit $exec + ; GCN-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 %2.sub0, %2.sub1, implicit $exec + ; GCN-NEXT: early-clobber %3:vreg_512_align2 = contract V_MFMA_F32_32X32X16_FP8_FP8_vgprcd_e64 [[DEF]].sub0_sub1, [[DEF1]].sub0_sub1, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit %2, implicit %3, implicit [[V_MUL_LO_U32_e64_]] + ; + ; GCN-NO-SCHEDMODEL-LABEL: name: 
sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE + ; GCN-NO-SCHEDMODEL: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; GCN-NO-SCHEDMODEL-NEXT: [[DEF1:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; GCN-NO-SCHEDMODEL-NEXT: early-clobber %2:vreg_512_align2 = contract V_MFMA_F32_32X32X16_FP8_FP8_vgprcd_e64 [[DEF]].sub0_sub1, [[DEF1]].sub0_sub1, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NO-SCHEDMODEL-NEXT: early-clobber %3:vreg_512_align2 = contract V_MFMA_F32_32X32X16_FP8_FP8_vgprcd_e64 [[DEF]].sub0_sub1, [[DEF1]].sub0_sub1, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NO-SCHEDMODEL-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 %2.sub0, %2.sub1, implicit $exec + ; GCN-NO-SCHEDMODEL-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NO-SCHEDMODEL-NEXT: dead [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF2]], 0, 0, implicit $exec + ; GCN-NO-SCHEDMODEL-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NO-SCHEDMODEL-NEXT: dead [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF3]], 0, 0, implicit $exec + ; GCN-NO-SCHEDMODEL-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NO-SCHEDMODEL-NEXT: dead [[DS_READ_U16_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF4]], 0, 0, implicit $exec + ; GCN-NO-SCHEDMODEL-NEXT: S_ENDPGM 0, implicit %2, implicit %3, implicit [[V_MUL_LO_U32_e64_]] + %0:vreg_128_align2 = IMPLICIT_DEF + %1:vreg_128_align2 = IMPLICIT_DEF + %2:vreg_512_align2 = contract V_MFMA_F32_32X32X16_FP8_FP8_vgprcd_e64 %0.sub0_sub1:vreg_128_align2, %1.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %3:vreg_512_align2 = contract V_MFMA_F32_32X32X16_FP8_FP8_vgprcd_e64 %0.sub0_sub1:vreg_128_align2, %1.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %2.sub0, %2.sub1, implicit $exec + %5:vgpr_32 = IMPLICIT_DEF + %6:vgpr_32 = DS_READ_U16_gfx9 %5, 0, 0, implicit $exec + %7:vgpr_32 = IMPLICIT_DEF 
+ %8:vgpr_32 = DS_READ_U16_gfx9 %7, 0, 0, implicit $exec + %9:vgpr_32 = IMPLICIT_DEF + %10:vgpr_32 = DS_READ_U16_gfx9 %9, 0, 0, implicit $exec + S_ENDPGM 0, implicit %2, implicit %3, implicit %4 +... diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll index 79187f51af0d2..04ea2117d5f57 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll @@ -1,4 +1,4 @@ -; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack -amdgpu-use-amdgpu-trackers=1 2>&1 < %s | FileCheck -check-prefixes=ERR-GCNTRACKERS %s +; xUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack -amdgpu-use-amdgpu-trackers=1 2>&1 < %s | FileCheck -check-prefixes=ERR-GCNTRACKERS %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack 2>&1 < %s | FileCheck -check-prefixes=GCN %s %asm.output = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll index b24b73967147f..63bc4f5c38445 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll @@ -245,10 +245,10 @@ bb3: ; CHECK-LABEL: {{^}}spill_func: ; GCN: NumSgprs: 104 ; GCN-GCNTRACKERS: NumSgprs: 104 -; GCN: NumVgprs: 3 -; GCN-GCNTRACKERS: NumVgprs: 4 -; GCN: ScratchSize: 12 -; GCN-GCNTRACKERS: ScratchSize: 16 +; GCN: NumVgprs: 2 +; GCN-GCNTRACKERS: NumVgprs: 3 +; GCN: ScratchSize: 8 +; GCN-GCNTRACKERS: ScratchSize: 12 define void @spill_func(ptr addrspace(1) %arg) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir index 34d203e0de2ff..aa0d1fe45e9a8 100644 --- 
a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir @@ -23,7 +23,7 @@ # GCN-GCNTRACKER: S_ENDPGM # When using the GCN Trackers, the scheduler is able to acieve desired occupancy without running high-RP-reschedule stage. However, the RP is still high, -# and RA is unable to allocate without spills. By running the high-RP-reschedule schedule we would have furhter decreased RP, which provides increased +# and RA is unable to allocate without spills. By running the high-RP-reschedule schedule we would have furhter decreased RP, which provides increased # flexibility for RA. --- @@ -72,7 +72,7 @@ body: | %33:vgpr_32 = V_CVT_U32_F32_e64 0, %32, 0, 0, implicit $mode, implicit $exec undef %34.sub0:sgpr_256 = S_MOV_B32 0 %35:sreg_32 = S_SUB_I32 0, %29, implicit-def dead $scc - %36:sreg_32 = V_READFIRSTLANE_B32 %33, implicit $exec + %36:sreg_32_xm0 = V_READFIRSTLANE_B32 %33, implicit $exec %37:sreg_32 = S_MUL_I32 %35, %36 %38:sreg_32 = S_MUL_HI_U32 %36, %37 %39:sreg_32 = S_ADD_I32 %36, %38, implicit-def dead $scc @@ -99,7 +99,7 @@ body: | %58:vgpr_32 = V_MUL_F32_e64 0, 1333788670, 0, %57, 0, 0, implicit $mode, implicit $exec %59:vgpr_32 = V_CVT_U32_F32_e64 0, %58, 0, 0, implicit $mode, implicit $exec %60:sreg_32 = S_SUB_I32 0, %55, implicit-def dead $scc - %61:sreg_32 = V_READFIRSTLANE_B32 %59, implicit $exec + %61:sreg_32_xm0 = V_READFIRSTLANE_B32 %59, implicit $exec %62:sreg_32 = S_MUL_I32 %60, %61 %63:sreg_32 = S_MUL_HI_U32 %61, %62 %64:sreg_32 = S_ADD_I32 %61, %63, implicit-def dead $scc @@ -127,7 +127,7 @@ body: | %84:vgpr_32 = V_MUL_F32_e64 0, 1333788670, 0, %83, 0, 0, implicit $mode, implicit $exec %85:vgpr_32 = V_CVT_U32_F32_e64 0, %84, 0, 0, implicit $mode, implicit $exec %86:sreg_32 = S_SUB_I32 0, %81, implicit-def dead $scc - %87:sreg_32 = V_READFIRSTLANE_B32 %85, implicit $exec + %87:sreg_32_xm0 = V_READFIRSTLANE_B32 %85, implicit $exec %88:sreg_32 = S_MUL_I32 %86, %87 
%89:sreg_32 = S_MUL_HI_U32 %87, %88 %90:sreg_32 = S_ADD_I32 %87, %89, implicit-def dead $scc @@ -175,7 +175,7 @@ body: | %127:vgpr_32 = V_MUL_F32_e64 0, 1333788670, 0, %126, 0, 0, implicit $mode, implicit $exec %128:vgpr_32 = V_CVT_U32_F32_e64 0, %127, 0, 0, implicit $mode, implicit $exec %129:sreg_32 = S_SUB_I32 0, %124, implicit-def dead $scc - %130:sreg_32 = V_READFIRSTLANE_B32 %128, implicit $exec + %130:sreg_32_xm0 = V_READFIRSTLANE_B32 %128, implicit $exec %131:sreg_32 = S_MUL_I32 %129, %130 %132:sreg_32 = S_MUL_HI_U32 %130, %131 %133:sreg_32 = S_ADD_I32 %130, %132, implicit-def dead $scc diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll index 1e5d6755fbc85..bd1258cb1cf98 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll @@ -42,4 +42,4 @@ bb2: declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone } -attributes #1 = { "amdgpu-num-vgpr"="9" "amdgpu-flat-work-group-size"="1024,1024" } +attributes #1 = { "amdgpu-num-vgpr"="9" } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll index 2c9d24ee04ebf..462ac23ec7e0e 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -1,7 +1,11 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji 
-enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s ; SI-MINREG: NumSgprs: {{[1-9]$}} ; SI-MINREG: NumVgprs: {{[1-9]$}} @@ -10,8 +14,12 @@ ; SI-MAXOCC: NumVgprs: {{[1-4]?[0-9]$}} ; stores may alias loads -; VI: NumSgprs: {{[0-9]$}} -; VI: NumVgprs: {{[1-3][0-9]$}} +; VI-MINREG: NumSgprs: {{[0-9]$}} +; VI-MINREG: NumVgprs: {{[1-3][0-9]$}} + +; stores may alias loads +; VI-MAXOCC: NumSgprs: {{[1-3][0-9]$}} +; VI-MAXOCC: NumVgprs: {{[1-6][0-9]$}} define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %in_arg, ptr addrspace(1) nocapture %out_arg) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll index 96b40bca5e2e3..ef24996d00274 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=amdgcn 
-mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=MISCHED %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-sched-strategy=iterative-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s ; Test the scheduler when only one wave is requested. The result should be high register usage and max ILP. diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll index 8cb1d250a6fa7..118c47e680709 100644 --- a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll @@ -15,9 +15,8 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) { ; GFX900-NEXT: s_mov_b32 s0, 0 ; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX900-NEXT: ; implicit-def: $vgpr0 -; GFX900-NEXT: ; implicit-def: $sgpr2 -; GFX900-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX900-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX900-NEXT: s_xor_b64 s[6:7], exec, s[2:3] ; GFX900-NEXT: s_cbranch_execz .LBB0_2 ; GFX900-NEXT: ; %bb.1: ; %bb1 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 @@ -33,12 +32,11 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) { ; GFX900-NEXT: s_mov_b32 s14, s0 ; GFX900-NEXT: s_mov_b32 s15, s0 ; GFX900-NEXT: image_sample v[0:1], v[0:1], s[8:15], s[0:3] dmask:0x3 -; GFX900-NEXT: s_mov_b32 s2, 1.0 ; GFX900-NEXT: .LBB0_2: ; %Flow ; GFX900-NEXT: s_or_saveexec_b64 s[0:1], s[6:7] ; GFX900-NEXT: s_and_b64 exec, exec, s[4:5] ; GFX900-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX900-NEXT: v_mov_b32_e32 v2, s2 +; GFX900-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX900-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX900-NEXT: s_cbranch_execz .LBB0_5 ; GFX900-NEXT: ; %bb.3: ; %bb5 diff --git 
a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll index 50a3336a7483c..9a168c133c552 100644 --- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll @@ -30,6 +30,7 @@ define float @v_test_fmin_legacy_ule_f32_safe(float %a, float %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule float %a, %b @@ -59,6 +60,7 @@ define float @v_test_fmin_legacy_ule_f32_nnan_flag(float %a, float %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule float %a, %b @@ -88,6 +90,7 @@ define float @v_test_fmin_legacy_ule_f32_nsz_flag(float %a, float %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule float %a, %b @@ -144,6 +147,7 @@ define float @v_test_fmax_legacy_uge_f32_safe(float %a, float %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge float %a, %b @@ -173,6 +177,7 @@ define float @v_test_fmax_legacy_uge_f32_nnan_flag(float %a, float %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge 
float %a, %b @@ -202,6 +207,7 @@ define float @v_test_fmax_legacy_uge_f32_nsz_flag(float %a, float %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge float %a, %b @@ -261,8 +267,10 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_safe(<2 x float> %a, <2 x float ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x float> %a, %b @@ -295,8 +303,10 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_flag(<2 x float> %a, <2 x ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x float> %a, %b @@ -329,8 +339,10 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_nsz_flag(<2 x float> %a, <2 x f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x float> %a, %b @@ -363,8 +375,10 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag(<2 x float> %a, < ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x float> %a, %b @@ -397,8 +411,10 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_safe(<2 x float> %a, <2 x float ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x float> %a, %b @@ -431,8 +447,10 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_flag(<2 x float> %a, <2 x ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x float> %a, %b @@ -465,8 +483,10 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_nsz_flag(<2 x float> %a, <2 x f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x float> %a, %b @@ -499,8 +519,10 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag(<2 x float> %a, < ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x float> %a, %b @@ -534,6 +556,7 @@ define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b @@ -567,6 +590,7 @@ define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b @@ -600,6 +624,7 @@ define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b @@ -664,6 +689,7 @@ define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge half %a, %b @@ -697,6 +723,7 @@ define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, 
v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge half %a, %b @@ -730,6 +757,7 @@ define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge half %a, %b @@ -806,11 +834,14 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b @@ -856,11 +887,14 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b @@ -906,11 +940,14 
@@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b @@ -992,11 +1029,14 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x half> %a, %b @@ -1042,11 +1082,14 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: 
v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x half> %a, %b @@ -1092,11 +1135,14 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX12-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp uge <2 x half> %a, %b @@ -1197,14 +1243,18 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; 
GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 @@ -1272,14 +1322,18 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 @@ -1347,14 +1401,18 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd 
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 @@ -1470,14 +1528,18 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 @@ -1545,14 +1607,18 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: s_wait_alu 
0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 @@ -1620,14 +1686,18 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v5, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v7, v6 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v3 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index f5af93abacf9b..ac9bb27b1c1c8 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -1903,19 +1903,14 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-LABEL: v_vselect_v16f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v31, s30, 0 -; VI-NEXT: v_writelane_b32 v31, s31, 1 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v17 -; VI-NEXT: v_cmp_eq_u32_e64 s[30:31], 0, v29 +; VI-NEXT: v_cmp_eq_u32_e64 s[40:41], 0, v29 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 ; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 ; VI-NEXT: v_cmp_eq_u32_e64 s[28:29], 0, v27 -; VI-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[30:31] +; VI-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[40:41] ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 ; VI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v19 @@ -1957,8 +1952,6 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v28 ; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[14:15] ; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[16:17] -; VI-NEXT: v_readlane_b32 s30, v31, 0 -; VI-NEXT: v_readlane_b32 s31, v31, 1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc @@ -1976,10 +1969,6 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_vselect_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir index 27904eac2ed63..09e25075e51c5 
100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir @@ -263,6 +263,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec ; VMEM-GFX8-NEXT: $exec = S_MOV_B64 1 @@ -348,6 +380,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec ; VMEM-GFX8-NEXT: $exec = S_MOV_B64 1 @@ -436,6 +500,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; 
VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec ; VMEM-GFX8-NEXT: $exec = S_MOV_B64 3 @@ -522,6 +618,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; 
VMEM-GFX8-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec ; VMEM-GFX8-NEXT: $exec = S_MOV_B64 3 @@ -614,6 +742,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec ; VMEM-GFX8-NEXT: $exec = S_MOV_B64 1 @@ -710,6 +870,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; 
VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec ; VMEM-GFX8-NEXT: $exec = S_MOV_B64 1 diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll index 56365b3c3d712..17a58a1323f02 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX942 %s define void @v_shuffle_v2i64_v8i64__u_u(ptr addrspace(1) inreg %ptr) { @@ -38,16 +38,16 @@ define void 
@v_shuffle_v2i64_v8i64__0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -77,16 +77,16 @@ define void @v_shuffle_v2i64_v8i64__1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def 
$0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -116,16 +116,16 @@ define void @v_shuffle_v2i64_v8i64__2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -155,16 +155,16 @@ define void @v_shuffle_v2i64_v8i64__3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: 
;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -194,16 +194,16 @@ define void @v_shuffle_v2i64_v8i64__4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -233,16 +233,16 @@ define void @v_shuffle_v2i64_v8i64__5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_u: +; GFX942: ; 
%bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -272,16 +272,16 @@ define void @v_shuffle_v2i64_v8i64__6_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -315,18 +315,18 @@ define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: 
v_mov_b32_e32 v0, v14 -; GFX940-NEXT: v_mov_b32_e32 v1, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v14 +; GFX942-NEXT: v_mov_b32_e32 v1, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -367,16 +367,16 @@ define void @v_shuffle_v2i64_v8i64__9_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -407,16 +407,16 @@ define void @v_shuffle_v2i64_v8i64__10_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -447,16 +447,16 @@ define void @v_shuffle_v2i64_v8i64__11_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector 
<8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -487,16 +487,16 @@ define void @v_shuffle_v2i64_v8i64__12_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -527,16 +527,16 @@ define void @v_shuffle_v2i64_v8i64__13_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt 
vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -567,16 +567,16 @@ define void @v_shuffle_v2i64_v8i64__14_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -611,18 +611,18 @@ define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v14 -; GFX940-NEXT: v_mov_b32_e32 v1, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v14 +; GFX942-NEXT: v_mov_b32_e32 v1, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -667,24 +667,24 @@ define void @v_shuffle_v2i64_v8i64__15_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v16 -; GFX940-NEXT: v_mov_b32_e32 v3, v17 -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v18, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v16 +; GFX942-NEXT: v_mov_b32_e32 v3, v17 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v18, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 
= call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -725,22 +725,22 @@ define void @v_shuffle_v2i64_v8i64__15_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v18 -; GFX940-NEXT: v_mov_b32_e32 v1, v19 -; GFX940-NEXT: global_store_dwordx4 v20, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v18 +; GFX942-NEXT: v_mov_b32_e32 v1, v19 +; GFX942-NEXT: global_store_dwordx4 v20, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -781,22 +781,22 @@ define void @v_shuffle_v2i64_v8i64__15_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; 
GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v20 -; GFX940-NEXT: v_mov_b32_e32 v3, v21 -; GFX940-NEXT: global_store_dwordx4 v22, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v20 +; GFX942-NEXT: v_mov_b32_e32 v3, v21 +; GFX942-NEXT: global_store_dwordx4 v22, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -837,22 +837,22 @@ define void @v_shuffle_v2i64_v8i64__15_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v22 -; GFX940-NEXT: v_mov_b32_e32 v5, v23 -; GFX940-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: 
;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v22 +; GFX942-NEXT: v_mov_b32_e32 v5, v23 +; GFX942-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -893,22 +893,22 @@ define void @v_shuffle_v2i64_v8i64__15_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v24 -; GFX940-NEXT: v_mov_b32_e32 v7, v25 -; GFX940-NEXT: global_store_dwordx4 v26, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v24 +; GFX942-NEXT: v_mov_b32_e32 v7, v25 +; GFX942-NEXT: global_store_dwordx4 v26, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -949,22 +949,22 @@ define void @v_shuffle_v2i64_v8i64__15_5(ptr addrspace(1) inreg %ptr) { ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v26 -; GFX940-NEXT: v_mov_b32_e32 v9, v27 -; GFX940-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v26 +; GFX942-NEXT: v_mov_b32_e32 v9, v27 +; GFX942-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -1005,22 +1005,22 @@ define void @v_shuffle_v2i64_v8i64__15_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v28 -; GFX940-NEXT: v_mov_b32_e32 v11, v29 -; GFX940-NEXT: global_store_dwordx4 v30, 
v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v28 +; GFX942-NEXT: v_mov_b32_e32 v11, v29 +; GFX942-NEXT: global_store_dwordx4 v30, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -1061,22 +1061,22 @@ define void @v_shuffle_v2i64_v8i64__15_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v30 -; GFX940-NEXT: v_mov_b32_e32 v13, v31 -; GFX940-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v30 +; GFX942-NEXT: v_mov_b32_e32 v13, v31 +; GFX942-NEXT: 
global_store_dwordx4 v32, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -1115,20 +1115,20 @@ define void @v_shuffle_v2i64_v8i64__15_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v14 -; GFX940-NEXT: v_mov_b32_e32 v3, v15 -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v14 +; GFX942-NEXT: v_mov_b32_e32 v3, v15 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -1163,18 +1163,18 @@ define void @v_shuffle_v2i64_v8i64__15_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] 
-; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v14 -; GFX940-NEXT: v_mov_b32_e32 v1, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v14 +; GFX942-NEXT: v_mov_b32_e32 v1, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -1209,18 +1209,18 @@ define void @v_shuffle_v2i64_v8i64__15_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v14 -; GFX940-NEXT: v_mov_b32_e32 v3, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v14 +; GFX942-NEXT: v_mov_b32_e32 v3, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", 
"=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -1255,18 +1255,18 @@ define void @v_shuffle_v2i64_v8i64__15_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v14 -; GFX940-NEXT: v_mov_b32_e32 v5, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v14 +; GFX942-NEXT: v_mov_b32_e32 v5, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -1301,18 +1301,18 @@ define void @v_shuffle_v2i64_v8i64__15_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v14 -; GFX940-NEXT: v_mov_b32_e32 v7, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
v_shuffle_v2i64_v8i64__15_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v14 +; GFX942-NEXT: v_mov_b32_e32 v7, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -1347,18 +1347,18 @@ define void @v_shuffle_v2i64_v8i64__15_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v14 -; GFX940-NEXT: v_mov_b32_e32 v9, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -1393,18 +1393,18 @@ define void @v_shuffle_v2i64_v8i64__15_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: v_shuffle_v2i64_v8i64__15_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v14 -; GFX940-NEXT: v_mov_b32_e32 v11, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v14 +; GFX942-NEXT: v_mov_b32_e32 v11, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -1439,18 +1439,18 @@ define void @v_shuffle_v2i64_v8i64__15_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v14 -; GFX940-NEXT: v_mov_b32_e32 v13, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__15_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v14 +; GFX942-NEXT: 
v_mov_b32_e32 v13, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -1485,18 +1485,18 @@ define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -1530,18 +1530,18 @@ define void @v_shuffle_v2i64_v8i64__0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; 
GFX940-NEXT: v_mov_b32_e32 v3, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> zeroinitializer store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -1575,18 +1575,18 @@ define void @v_shuffle_v2i64_v8i64__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> 
%shuf, ptr addrspace(1) %ptr, align 16 @@ -1620,18 +1620,18 @@ define void @v_shuffle_v2i64_v8i64__2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v0 -; GFX940-NEXT: v_mov_b32_e32 v7, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -1665,18 +1665,18 @@ define void @v_shuffle_v2i64_v8i64__3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v0 -; GFX940-NEXT: v_mov_b32_e32 v9, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -1710,18 +1710,18 @@ define void @v_shuffle_v2i64_v8i64__4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v0 -; GFX940-NEXT: v_mov_b32_e32 v11, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -1755,18 +1755,18 @@ define void @v_shuffle_v2i64_v8i64__5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v0 -; GFX940-NEXT: v_mov_b32_e32 v13, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -1800,18 +1800,18 @@ define void @v_shuffle_v2i64_v8i64__6_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v0 -; GFX940-NEXT: v_mov_b32_e32 v15, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; 
GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -1849,20 +1849,20 @@ define void @v_shuffle_v2i64_v8i64__7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v14 -; GFX940-NEXT: v_mov_b32_e32 v3, v15 -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v14 +; GFX942-NEXT: v_mov_b32_e32 v3, v15 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -1896,18 +1896,18 @@ define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; 
GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -1947,22 +1947,22 @@ define void @v_shuffle_v2i64_v8i64__9_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v0 -; GFX940-NEXT: v_mov_b32_e32 v7, v1 -; GFX940-NEXT: global_store_dwordx4 v18, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: 
v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v18, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -2003,22 +2003,22 @@ define void @v_shuffle_v2i64_v8i64__10_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v0 -; GFX940-NEXT: v_mov_b32_e32 v9, v1 -; GFX940-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -2059,22 +2059,22 @@ define void @v_shuffle_v2i64_v8i64__11_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v0 -; GFX940-NEXT: v_mov_b32_e32 v11, v1 -; GFX940-NEXT: global_store_dwordx4 v18, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: global_store_dwordx4 v18, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -2115,22 +2115,22 @@ define void @v_shuffle_v2i64_v8i64__12_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v0 -; GFX940-NEXT: v_mov_b32_e32 v13, v1 -; GFX940-NEXT: global_store_dwordx4 v18, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_0: +; GFX942: ; %bb.0: 
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v1 +; GFX942-NEXT: global_store_dwordx4 v18, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -2171,22 +2171,22 @@ define void @v_shuffle_v2i64_v8i64__13_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v0 -; GFX940-NEXT: v_mov_b32_e32 v15, v1 -; GFX940-NEXT: global_store_dwordx4 v18, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v1 +; GFX942-NEXT: global_store_dwordx4 v18, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = 
call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -2227,22 +2227,22 @@ define void @v_shuffle_v2i64_v8i64__14_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v16, v0 -; GFX940-NEXT: v_mov_b32_e32 v17, v1 -; GFX940-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v1 +; GFX942-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -2273,16 +2273,16 @@ define void @v_shuffle_v2i64_v8i64__u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 
v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -2312,16 +2312,16 @@ define void @v_shuffle_v2i64_v8i64__0_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -2355,18 +2355,18 @@ define void @v_shuffle_v2i64_v8i64__1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -2400,18 +2400,18 @@ define void @v_shuffle_v2i64_v8i64__2_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -2445,18 +2445,18 @@ define void @v_shuffle_v2i64_v8i64__3_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v2 -; GFX940-NEXT: v_mov_b32_e32 v9, v3 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -2490,18 +2490,18 @@ define void @v_shuffle_v2i64_v8i64__4_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v2 -; GFX940-NEXT: v_mov_b32_e32 v11, v3 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; 
GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -2535,18 +2535,18 @@ define void @v_shuffle_v2i64_v8i64__5_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v2 -; GFX940-NEXT: v_mov_b32_e32 v13, v3 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-NEXT: v_mov_b32_e32 v13, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -2580,18 +2580,18 @@ define void @v_shuffle_v2i64_v8i64__6_1(ptr 
addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v2 -; GFX940-NEXT: v_mov_b32_e32 v15, v3 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v2 +; GFX942-NEXT: v_mov_b32_e32 v15, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -2625,18 +2625,18 @@ define void @v_shuffle_v2i64_v8i64__7_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v14 -; GFX940-NEXT: v_mov_b32_e32 v1, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND 
+; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v14 +; GFX942-NEXT: v_mov_b32_e32 v1, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -2666,16 +2666,16 @@ define void @v_shuffle_v2i64_v8i64__8_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -2715,22 +2715,22 @@ define void @v_shuffle_v2i64_v8i64__9_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; 
GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v2 -; GFX940-NEXT: v_mov_b32_e32 v9, v3 -; GFX940-NEXT: global_store_dwordx4 v20, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v20, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -2771,22 +2771,22 @@ define void @v_shuffle_v2i64_v8i64__10_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v2 -; GFX940-NEXT: v_mov_b32_e32 v11, v3 -; GFX940-NEXT: global_store_dwordx4 v20, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: 
;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v20, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -2827,22 +2827,22 @@ define void @v_shuffle_v2i64_v8i64__11_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v2 -; GFX940-NEXT: v_mov_b32_e32 v13, v3 -; GFX940-NEXT: global_store_dwordx4 v20, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-NEXT: v_mov_b32_e32 v13, v3 +; GFX942-NEXT: global_store_dwordx4 v20, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -2883,22 +2883,22 @@ define void @v_shuffle_v2i64_v8i64__12_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v2 -; GFX940-NEXT: v_mov_b32_e32 v15, v3 -; GFX940-NEXT: global_store_dwordx4 v20, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v2 +; GFX942-NEXT: v_mov_b32_e32 v15, v3 +; GFX942-NEXT: global_store_dwordx4 v20, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -2939,22 +2939,22 @@ define void @v_shuffle_v2i64_v8i64__13_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v16, v2 -; GFX940-NEXT: v_mov_b32_e32 v17, v3 -; GFX940-NEXT: global_store_dwordx4 v20, v[14:17], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt 
vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v2 +; GFX942-NEXT: v_mov_b32_e32 v17, v3 +; GFX942-NEXT: global_store_dwordx4 v20, v[14:17], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -2995,22 +2995,22 @@ define void @v_shuffle_v2i64_v8i64__14_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v18, v2 -; GFX940-NEXT: v_mov_b32_e32 v19, v3 -; GFX940-NEXT: global_store_dwordx4 v20, v[16:19], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v18, v2 +; GFX942-NEXT: v_mov_b32_e32 v19, v3 +; GFX942-NEXT: global_store_dwordx4 v20, v[16:19], s[0:1] +; GFX942-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -3041,16 +3041,16 @@ define void @v_shuffle_v2i64_v8i64__u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3084,18 +3084,18 @@ define void @v_shuffle_v2i64_v8i64__0_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v4 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3125,16 +3125,16 @@ define void @v_shuffle_v2i64_v8i64__1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3168,18 +3168,18 @@ define void @v_shuffle_v2i64_v8i64__2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; 
GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v4 -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3213,18 +3213,18 @@ define void @v_shuffle_v2i64_v8i64__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v4 -; GFX940-NEXT: v_mov_b32_e32 v9, v5 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x 
i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3258,18 +3258,18 @@ define void @v_shuffle_v2i64_v8i64__4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v4 -; GFX940-NEXT: v_mov_b32_e32 v11, v5 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3303,18 +3303,18 @@ define void @v_shuffle_v2i64_v8i64__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v4 -; GFX940-NEXT: v_mov_b32_e32 v13, v5 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
v_shuffle_v2i64_v8i64__5_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3348,18 +3348,18 @@ define void @v_shuffle_v2i64_v8i64__6_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v4 -; GFX940-NEXT: v_mov_b32_e32 v15, v5 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v4 +; GFX942-NEXT: v_mov_b32_e32 v15, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3393,18 +3393,18 @@ define void @v_shuffle_v2i64_v8i64__7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v14 -; GFX940-NEXT: v_mov_b32_e32 v3, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v14 +; GFX942-NEXT: v_mov_b32_e32 v3, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3434,16 +3434,16 @@ define void @v_shuffle_v2i64_v8i64__8_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] 
%vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3483,22 +3483,22 @@ define void @v_shuffle_v2i64_v8i64__9_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v4 -; GFX940-NEXT: v_mov_b32_e32 v11, v5 -; GFX940-NEXT: global_store_dwordx4 v22, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v22, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -3539,22 +3539,22 @@ define void @v_shuffle_v2i64_v8i64__10_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; 
GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v4 -; GFX940-NEXT: v_mov_b32_e32 v13, v5 -; GFX940-NEXT: global_store_dwordx4 v22, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 +; GFX942-NEXT: global_store_dwordx4 v22, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -3595,22 +3595,22 @@ define void @v_shuffle_v2i64_v8i64__11_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v4 -; GFX940-NEXT: v_mov_b32_e32 v15, v5 -; GFX940-NEXT: global_store_dwordx4 v22, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; 
GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v4 +; GFX942-NEXT: v_mov_b32_e32 v15, v5 +; GFX942-NEXT: global_store_dwordx4 v22, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -3651,22 +3651,22 @@ define void @v_shuffle_v2i64_v8i64__12_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v16, v4 -; GFX940-NEXT: v_mov_b32_e32 v17, v5 -; GFX940-NEXT: global_store_dwordx4 v22, v[14:17], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v4 +; GFX942-NEXT: v_mov_b32_e32 v17, v5 +; GFX942-NEXT: global_store_dwordx4 v22, v[14:17], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -3707,22 
+3707,22 @@ define void @v_shuffle_v2i64_v8i64__13_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v18, v4 -; GFX940-NEXT: v_mov_b32_e32 v19, v5 -; GFX940-NEXT: global_store_dwordx4 v22, v[16:19], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v18, v4 +; GFX942-NEXT: v_mov_b32_e32 v19, v5 +; GFX942-NEXT: global_store_dwordx4 v22, v[16:19], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -3763,22 +3763,22 @@ define void @v_shuffle_v2i64_v8i64__14_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v20, v4 -; 
GFX940-NEXT: v_mov_b32_e32 v21, v5 -; GFX940-NEXT: global_store_dwordx4 v22, v[18:21], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v20, v4 +; GFX942-NEXT: v_mov_b32_e32 v21, v5 +; GFX942-NEXT: global_store_dwordx4 v22, v[18:21], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -3809,16 +3809,16 @@ define void @v_shuffle_v2i64_v8i64__u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3852,18 +3852,18 @@ 
define void @v_shuffle_v2i64_v8i64__0_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v6 -; GFX940-NEXT: v_mov_b32_e32 v3, v7 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3897,18 +3897,18 @@ define void @v_shuffle_v2i64_v8i64__1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v6 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def 
v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3938,16 +3938,16 @@ define void @v_shuffle_v2i64_v8i64__2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -3981,18 +3981,18 @@ define void @v_shuffle_v2i64_v8i64__3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v6 -; GFX940-NEXT: 
v_mov_b32_e32 v9, v7 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4026,18 +4026,18 @@ define void @v_shuffle_v2i64_v8i64__4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v6 -; GFX940-NEXT: v_mov_b32_e32 v11, v7 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) 
%ptr, align 16 @@ -4071,18 +4071,18 @@ define void @v_shuffle_v2i64_v8i64__5_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v6 -; GFX940-NEXT: v_mov_b32_e32 v13, v7 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v6 +; GFX942-NEXT: v_mov_b32_e32 v13, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4116,18 +4116,18 @@ define void @v_shuffle_v2i64_v8i64__6_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v6 -; GFX940-NEXT: v_mov_b32_e32 v15, v7 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v6 +; GFX942-NEXT: v_mov_b32_e32 v15, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4161,18 +4161,18 @@ define void @v_shuffle_v2i64_v8i64__7_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v14 -; GFX940-NEXT: v_mov_b32_e32 v5, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v14 +; GFX942-NEXT: v_mov_b32_e32 v5, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4202,16 +4202,16 @@ define void @v_shuffle_v2i64_v8i64__8_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4251,22 +4251,22 @@ define void @v_shuffle_v2i64_v8i64__9_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v6 -; GFX940-NEXT: v_mov_b32_e32 v13, v7 -; GFX940-NEXT: global_store_dwordx4 v24, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v6 +; GFX942-NEXT: v_mov_b32_e32 v13, v7 +; 
GFX942-NEXT: global_store_dwordx4 v24, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -4307,22 +4307,22 @@ define void @v_shuffle_v2i64_v8i64__10_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v6 -; GFX940-NEXT: v_mov_b32_e32 v15, v7 -; GFX940-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v6 +; GFX942-NEXT: v_mov_b32_e32 v15, v7 +; GFX942-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -4363,22 +4363,22 @@ define void @v_shuffle_v2i64_v8i64__11_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v16, v6 -; GFX940-NEXT: v_mov_b32_e32 v17, v7 -; GFX940-NEXT: global_store_dwordx4 v24, v[14:17], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v6 +; GFX942-NEXT: v_mov_b32_e32 v17, v7 +; GFX942-NEXT: global_store_dwordx4 v24, v[14:17], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -4419,22 +4419,22 @@ define void @v_shuffle_v2i64_v8i64__12_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v18, v6 -; GFX940-NEXT: v_mov_b32_e32 v19, v7 -; GFX940-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_3: +; GFX942: ; %bb.0: +; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v18, v6 +; GFX942-NEXT: v_mov_b32_e32 v19, v7 +; GFX942-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -4475,22 +4475,22 @@ define void @v_shuffle_v2i64_v8i64__13_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v20, v6 -; GFX940-NEXT: v_mov_b32_e32 v21, v7 -; GFX940-NEXT: global_store_dwordx4 v24, v[18:21], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v20, v6 +; GFX942-NEXT: v_mov_b32_e32 v21, v7 +; GFX942-NEXT: global_store_dwordx4 v24, v[18:21], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call 
<8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -4531,22 +4531,22 @@ define void @v_shuffle_v2i64_v8i64__14_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v22, v6 -; GFX940-NEXT: v_mov_b32_e32 v23, v7 -; GFX940-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v22, v6 +; GFX942-NEXT: v_mov_b32_e32 v23, v7 +; GFX942-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -4577,16 +4577,16 @@ define void @v_shuffle_v2i64_v8i64__u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, 
v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4620,18 +4620,18 @@ define void @v_shuffle_v2i64_v8i64__0_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v8 -; GFX940-NEXT: v_mov_b32_e32 v3, v9 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v8 +; GFX942-NEXT: v_mov_b32_e32 v3, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4665,18 +4665,18 @@ define void @v_shuffle_v2i64_v8i64__1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v8 -; GFX940-NEXT: v_mov_b32_e32 v5, v9 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4710,18 +4710,18 @@ define void @v_shuffle_v2i64_v8i64__2_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v8 -; GFX940-NEXT: v_mov_b32_e32 v7, v9 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 
v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4751,16 +4751,16 @@ define void @v_shuffle_v2i64_v8i64__3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4794,18 +4794,18 @@ define void @v_shuffle_v2i64_v8i64__4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v8 -; GFX940-NEXT: v_mov_b32_e32 v11, v9 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt 
vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4839,18 +4839,18 @@ define void @v_shuffle_v2i64_v8i64__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v8 -; GFX940-NEXT: v_mov_b32_e32 v13, v9 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v8 +; GFX942-NEXT: v_mov_b32_e32 v13, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4884,18 +4884,18 @@ define void @v_shuffle_v2i64_v8i64__6_4(ptr addrspace(1) inreg %ptr) { ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v8 -; GFX940-NEXT: v_mov_b32_e32 v15, v9 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v8 +; GFX942-NEXT: v_mov_b32_e32 v15, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4929,18 +4929,18 @@ define void @v_shuffle_v2i64_v8i64__7_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v14 -; GFX940-NEXT: v_mov_b32_e32 v7, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v14 +; GFX942-NEXT: v_mov_b32_e32 v7, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -4970,16 +4970,16 @@ define void @v_shuffle_v2i64_v8i64__8_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -5019,22 +5019,22 @@ define void @v_shuffle_v2i64_v8i64__9_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; 
GFX940-NEXT: v_mov_b32_e32 v14, v8 -; GFX940-NEXT: v_mov_b32_e32 v15, v9 -; GFX940-NEXT: global_store_dwordx4 v26, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v8 +; GFX942-NEXT: v_mov_b32_e32 v15, v9 +; GFX942-NEXT: global_store_dwordx4 v26, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -5075,22 +5075,22 @@ define void @v_shuffle_v2i64_v8i64__10_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v16, v8 -; GFX940-NEXT: v_mov_b32_e32 v17, v9 -; GFX940-NEXT: global_store_dwordx4 v26, v[14:17], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v8 +; GFX942-NEXT: v_mov_b32_e32 v17, v9 +; GFX942-NEXT: global_store_dwordx4 v26, v[14:17], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -5131,22 +5131,22 @@ define void @v_shuffle_v2i64_v8i64__11_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v18, v8 -; GFX940-NEXT: v_mov_b32_e32 v19, v9 -; GFX940-NEXT: global_store_dwordx4 v26, v[16:19], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v18, v8 +; GFX942-NEXT: v_mov_b32_e32 v19, v9 +; GFX942-NEXT: global_store_dwordx4 v26, v[16:19], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -5187,22 +5187,22 @@ define void @v_shuffle_v2i64_v8i64__12_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v20, v8 -; GFX940-NEXT: v_mov_b32_e32 v21, v9 -; GFX940-NEXT: global_store_dwordx4 v26, v[18:21], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v20, v8 +; GFX942-NEXT: v_mov_b32_e32 v21, v9 +; GFX942-NEXT: global_store_dwordx4 v26, v[18:21], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -5243,22 +5243,22 @@ define void @v_shuffle_v2i64_v8i64__13_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v22, v8 -; GFX940-NEXT: v_mov_b32_e32 v23, v9 -; GFX940-NEXT: global_store_dwordx4 v26, v[20:23], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v22, v8 +; GFX942-NEXT: v_mov_b32_e32 v23, v9 +; GFX942-NEXT: global_store_dwordx4 v26, v[20:23], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -5299,22 +5299,22 @@ define void @v_shuffle_v2i64_v8i64__14_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v24, v8 -; GFX940-NEXT: v_mov_b32_e32 v25, v9 -; GFX940-NEXT: global_store_dwordx4 v26, v[22:25], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v24, v8 +; GFX942-NEXT: v_mov_b32_e32 v25, v9 +; GFX942-NEXT: global_store_dwordx4 v26, v[22:25], s[0:1] +; GFX942-NEXT: s_waitcnt 
vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -5345,16 +5345,16 @@ define void @v_shuffle_v2i64_v8i64__u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -5388,18 +5388,18 @@ define void @v_shuffle_v2i64_v8i64__0_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v10 -; GFX940-NEXT: v_mov_b32_e32 v3, v11 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v10 +; GFX942-NEXT: v_mov_b32_e32 v3, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -5433,18 +5433,18 @@ define void @v_shuffle_v2i64_v8i64__1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v10 -; GFX940-NEXT: v_mov_b32_e32 v5, v11 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v10 +; GFX942-NEXT: v_mov_b32_e32 v5, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -5478,18 +5478,18 @@ define void @v_shuffle_v2i64_v8i64__2_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_5: -; GFX940: ; 
%bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v10 -; GFX940-NEXT: v_mov_b32_e32 v7, v11 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -5523,18 +5523,18 @@ define void @v_shuffle_v2i64_v8i64__3_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v10 -; GFX940-NEXT: v_mov_b32_e32 v9, v11 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v16, 
v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -5564,16 +5564,16 @@ define void @v_shuffle_v2i64_v8i64__4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -5607,18 +5607,18 @@ define void @v_shuffle_v2i64_v8i64__5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v10 -; GFX940-NEXT: v_mov_b32_e32 v13, v11 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
v_shuffle_v2i64_v8i64__5_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -5652,18 +5652,18 @@ define void @v_shuffle_v2i64_v8i64__6_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v10 -; GFX940-NEXT: v_mov_b32_e32 v15, v11 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v10 +; GFX942-NEXT: v_mov_b32_e32 v15, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -5697,18 +5697,18 @@ define void @v_shuffle_v2i64_v8i64__7_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v14 -; GFX940-NEXT: v_mov_b32_e32 v9, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -5738,16 +5738,16 @@ define void @v_shuffle_v2i64_v8i64__8_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: 
s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -5787,22 +5787,22 @@ define void @v_shuffle_v2i64_v8i64__9_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v16, v10 -; GFX940-NEXT: v_mov_b32_e32 v17, v11 -; GFX940-NEXT: global_store_dwordx4 v28, v[14:17], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v10 +; GFX942-NEXT: v_mov_b32_e32 v17, v11 +; GFX942-NEXT: global_store_dwordx4 v28, v[14:17], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -5843,22 +5843,22 @@ define void @v_shuffle_v2i64_v8i64__10_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; 
GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v18, v10 -; GFX940-NEXT: v_mov_b32_e32 v19, v11 -; GFX940-NEXT: global_store_dwordx4 v28, v[16:19], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v18, v10 +; GFX942-NEXT: v_mov_b32_e32 v19, v11 +; GFX942-NEXT: global_store_dwordx4 v28, v[16:19], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -5899,22 +5899,22 @@ define void @v_shuffle_v2i64_v8i64__11_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v20, v10 -; GFX940-NEXT: v_mov_b32_e32 v21, v11 -; GFX940-NEXT: global_store_dwordx4 v28, v[18:21], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; 
GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v20, v10 +; GFX942-NEXT: v_mov_b32_e32 v21, v11 +; GFX942-NEXT: global_store_dwordx4 v28, v[18:21], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -5955,22 +5955,22 @@ define void @v_shuffle_v2i64_v8i64__12_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v22, v10 -; GFX940-NEXT: v_mov_b32_e32 v23, v11 -; GFX940-NEXT: global_store_dwordx4 v28, v[20:23], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v22, v10 +; GFX942-NEXT: v_mov_b32_e32 v23, v11 +; GFX942-NEXT: global_store_dwordx4 v28, v[20:23], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x 
i64> %vec1, <2 x i32> @@ -6011,22 +6011,22 @@ define void @v_shuffle_v2i64_v8i64__13_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v24, v10 -; GFX940-NEXT: v_mov_b32_e32 v25, v11 -; GFX940-NEXT: global_store_dwordx4 v28, v[22:25], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v24, v10 +; GFX942-NEXT: v_mov_b32_e32 v25, v11 +; GFX942-NEXT: global_store_dwordx4 v28, v[22:25], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -6067,22 +6067,22 @@ define void @v_shuffle_v2i64_v8i64__14_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 
-; GFX940-NEXT: v_mov_b32_e32 v26, v10 -; GFX940-NEXT: v_mov_b32_e32 v27, v11 -; GFX940-NEXT: global_store_dwordx4 v28, v[24:27], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v26, v10 +; GFX942-NEXT: v_mov_b32_e32 v27, v11 +; GFX942-NEXT: global_store_dwordx4 v28, v[24:27], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -6113,16 +6113,16 @@ define void @v_shuffle_v2i64_v8i64__u_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr 
addrspace(1) %ptr, align 16 @@ -6156,18 +6156,18 @@ define void @v_shuffle_v2i64_v8i64__0_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v12 -; GFX940-NEXT: v_mov_b32_e32 v3, v13 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v12 +; GFX942-NEXT: v_mov_b32_e32 v3, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -6201,18 +6201,18 @@ define void @v_shuffle_v2i64_v8i64__1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v12 -; GFX940-NEXT: v_mov_b32_e32 v5, v13 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v12 +; GFX942-NEXT: v_mov_b32_e32 v5, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -6246,18 +6246,18 @@ define void @v_shuffle_v2i64_v8i64__2_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v12 -; GFX940-NEXT: v_mov_b32_e32 v7, v13 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v12 +; GFX942-NEXT: v_mov_b32_e32 v7, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -6291,18 +6291,18 @@ define void @v_shuffle_v2i64_v8i64__3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v12 -; GFX940-NEXT: v_mov_b32_e32 v9, v13 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -6336,18 +6336,18 @@ define void @v_shuffle_v2i64_v8i64__4_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v12 -; GFX940-NEXT: v_mov_b32_e32 v11, v13 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v12 +; GFX942-NEXT: v_mov_b32_e32 v11, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -6377,16 +6377,16 @@ define void @v_shuffle_v2i64_v8i64__5_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -6420,18 +6420,18 @@ define void @v_shuffle_v2i64_v8i64__6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v12 -; GFX940-NEXT: v_mov_b32_e32 v15, v13 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_6: +; GFX942: ; 
%bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v12 +; GFX942-NEXT: v_mov_b32_e32 v15, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -6465,18 +6465,18 @@ define void @v_shuffle_v2i64_v8i64__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v14 -; GFX940-NEXT: v_mov_b32_e32 v11, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v14 +; GFX942-NEXT: v_mov_b32_e32 v11, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -6506,16 +6506,16 @@ define void @v_shuffle_v2i64_v8i64__8_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
v_shuffle_v2i64_v8i64__8_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -6555,22 +6555,22 @@ define void @v_shuffle_v2i64_v8i64__9_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v18, v12 -; GFX940-NEXT: v_mov_b32_e32 v19, v13 -; GFX940-NEXT: global_store_dwordx4 v30, v[16:19], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; 
GFX942-NEXT: v_mov_b32_e32 v18, v12 +; GFX942-NEXT: v_mov_b32_e32 v19, v13 +; GFX942-NEXT: global_store_dwordx4 v30, v[16:19], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -6611,22 +6611,22 @@ define void @v_shuffle_v2i64_v8i64__10_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v20, v12 -; GFX940-NEXT: v_mov_b32_e32 v21, v13 -; GFX940-NEXT: global_store_dwordx4 v30, v[18:21], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v20, v12 +; GFX942-NEXT: v_mov_b32_e32 v21, v13 +; GFX942-NEXT: global_store_dwordx4 v30, v[18:21], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -6667,22 +6667,22 @@ define void @v_shuffle_v2i64_v8i64__11_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: v_shuffle_v2i64_v8i64__11_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v22, v12 -; GFX940-NEXT: v_mov_b32_e32 v23, v13 -; GFX940-NEXT: global_store_dwordx4 v30, v[20:23], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v22, v12 +; GFX942-NEXT: v_mov_b32_e32 v23, v13 +; GFX942-NEXT: global_store_dwordx4 v30, v[20:23], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -6723,22 +6723,22 @@ define void @v_shuffle_v2i64_v8i64__12_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v24, v12 -; GFX940-NEXT: v_mov_b32_e32 v25, v13 -; GFX940-NEXT: global_store_dwordx4 v30, v[22:25], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v24, v12 +; GFX942-NEXT: v_mov_b32_e32 v25, v13 +; GFX942-NEXT: global_store_dwordx4 v30, v[22:25], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -6779,22 +6779,22 @@ define void @v_shuffle_v2i64_v8i64__13_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v26, v12 -; GFX940-NEXT: v_mov_b32_e32 v27, v13 -; GFX940-NEXT: global_store_dwordx4 v30, v[24:27], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v26, v12 +; GFX942-NEXT: v_mov_b32_e32 v27, v13 +; GFX942-NEXT: global_store_dwordx4 v30, v[24:27], s[0:1] +; GFX942-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -6835,22 +6835,22 @@ define void @v_shuffle_v2i64_v8i64__14_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v28, v12 -; GFX940-NEXT: v_mov_b32_e32 v29, v13 -; GFX940-NEXT: global_store_dwordx4 v30, v[26:29], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v28, v12 +; GFX942-NEXT: v_mov_b32_e32 v29, v13 +; GFX942-NEXT: global_store_dwordx4 v30, v[26:29], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -6881,16 +6881,16 @@ define void @v_shuffle_v2i64_v8i64__u_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; 
GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -6924,18 +6924,18 @@ define void @v_shuffle_v2i64_v8i64__0_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v14 -; GFX940-NEXT: v_mov_b32_e32 v3, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v14 +; GFX942-NEXT: v_mov_b32_e32 v3, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 
@@ -6969,18 +6969,18 @@ define void @v_shuffle_v2i64_v8i64__1_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v14 -; GFX940-NEXT: v_mov_b32_e32 v5, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v14 +; GFX942-NEXT: v_mov_b32_e32 v5, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7014,18 +7014,18 @@ define void @v_shuffle_v2i64_v8i64__2_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v14 -; GFX940-NEXT: v_mov_b32_e32 v7, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: 
;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v14 +; GFX942-NEXT: v_mov_b32_e32 v7, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7059,18 +7059,18 @@ define void @v_shuffle_v2i64_v8i64__3_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v14 -; GFX940-NEXT: v_mov_b32_e32 v9, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7104,18 +7104,18 @@ define void @v_shuffle_v2i64_v8i64__4_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v14 -; GFX940-NEXT: v_mov_b32_e32 v11, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v14 +; GFX942-NEXT: v_mov_b32_e32 v11, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7149,18 +7149,18 @@ define void @v_shuffle_v2i64_v8i64__5_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v14 -; GFX940-NEXT: v_mov_b32_e32 v13, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v14 +; GFX942-NEXT: v_mov_b32_e32 v13, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt 
vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7190,16 +7190,16 @@ define void @v_shuffle_v2i64_v8i64__6_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7233,18 +7233,18 @@ define void @v_shuffle_v2i64_v8i64__7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v14 -; GFX940-NEXT: v_mov_b32_e32 v13, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_7: +; GFX942: ; %bb.0: +; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v14 +; GFX942-NEXT: v_mov_b32_e32 v13, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7274,16 +7274,16 @@ define void @v_shuffle_v2i64_v8i64__8_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7323,22 +7323,22 @@ define void @v_shuffle_v2i64_v8i64__9_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; 
GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v20, v14 -; GFX940-NEXT: v_mov_b32_e32 v21, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[18:21], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v20, v14 +; GFX942-NEXT: v_mov_b32_e32 v21, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[18:21], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -7379,22 +7379,22 @@ define void @v_shuffle_v2i64_v8i64__10_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v22, v14 -; GFX940-NEXT: v_mov_b32_e32 v23, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; 
GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v22, v14 +; GFX942-NEXT: v_mov_b32_e32 v23, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -7435,22 +7435,22 @@ define void @v_shuffle_v2i64_v8i64__11_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v24, v14 -; GFX940-NEXT: v_mov_b32_e32 v25, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[22:25], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v24, v14 +; GFX942-NEXT: v_mov_b32_e32 v25, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[22:25], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x 
i64> %vec1, <2 x i32> @@ -7491,22 +7491,22 @@ define void @v_shuffle_v2i64_v8i64__12_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v26, v14 -; GFX940-NEXT: v_mov_b32_e32 v27, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v26, v14 +; GFX942-NEXT: v_mov_b32_e32 v27, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -7547,22 +7547,22 @@ define void @v_shuffle_v2i64_v8i64__13_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 
-; GFX940-NEXT: v_mov_b32_e32 v28, v14 -; GFX940-NEXT: v_mov_b32_e32 v29, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[26:29], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v28, v14 +; GFX942-NEXT: v_mov_b32_e32 v29, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[26:29], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -7603,22 +7603,22 @@ define void @v_shuffle_v2i64_v8i64__14_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v30, v14 -; GFX940-NEXT: v_mov_b32_e32 v31, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: 
;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v30, v14 +; GFX942-NEXT: v_mov_b32_e32 v31, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -7660,16 +7660,16 @@ define void @v_shuffle_v2i64_v8i64__0_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7699,16 +7699,16 @@ define void @v_shuffle_v2i64_v8i64__1_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7738,16 +7738,16 @@ define void @v_shuffle_v2i64_v8i64__2_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7777,16 +7777,16 @@ define void @v_shuffle_v2i64_v8i64__3_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: 
;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7816,16 +7816,16 @@ define void @v_shuffle_v2i64_v8i64__4_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7855,16 +7855,16 @@ define void @v_shuffle_v2i64_v8i64__5_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; 
-; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7894,16 +7894,16 @@ define void @v_shuffle_v2i64_v8i64__6_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) 
%ptr, align 16 @@ -7937,18 +7937,18 @@ define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v14 -; GFX940-NEXT: v_mov_b32_e32 v1, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v14 +; GFX942-NEXT: v_mov_b32_e32 v1, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 @@ -7993,18 +7993,18 @@ define void @v_shuffle_v2i64_v8i64__9_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8039,18 +8039,18 @@ define void @v_shuffle_v2i64_v8i64__10_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v0 -; GFX940-NEXT: v_mov_b32_e32 v7, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8085,18 +8085,18 @@ define void @v_shuffle_v2i64_v8i64__11_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v0 -; GFX940-NEXT: v_mov_b32_e32 v9, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8131,18 +8131,18 @@ define void @v_shuffle_v2i64_v8i64__12_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v0 -; GFX940-NEXT: v_mov_b32_e32 v11, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8177,18 +8177,18 @@ define void @v_shuffle_v2i64_v8i64__13_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v0 -; GFX940-NEXT: v_mov_b32_e32 v13, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8223,18 +8223,18 @@ define void @v_shuffle_v2i64_v8i64__14_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v0 -; GFX940-NEXT: v_mov_b32_e32 v15, v1 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8265,16 +8265,16 @@ define void @v_shuffle_v2i64_v8i64__u_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8315,22 +8315,22 @@ define void @v_shuffle_v2i64_v8i64__0_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v4 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8371,22 +8371,22 @@ define void @v_shuffle_v2i64_v8i64__1_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v6 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8427,22 +8427,22 @@ define void @v_shuffle_v2i64_v8i64__2_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v8 -; GFX940-NEXT: v_mov_b32_e32 v7, v9 -; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 +; GFX942-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 
x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8483,22 +8483,22 @@ define void @v_shuffle_v2i64_v8i64__3_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v10 -; GFX940-NEXT: v_mov_b32_e32 v9, v11 -; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8539,22 +8539,22 @@ define void @v_shuffle_v2i64_v8i64__4_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 
0 -; GFX940-NEXT: v_mov_b32_e32 v10, v12 -; GFX940-NEXT: v_mov_b32_e32 v11, v13 -; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v12 +; GFX942-NEXT: v_mov_b32_e32 v11, v13 +; GFX942-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8595,22 +8595,22 @@ define void @v_shuffle_v2i64_v8i64__5_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v14 -; GFX940-NEXT: v_mov_b32_e32 v13, v15 -; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v14 +; GFX942-NEXT: v_mov_b32_e32 v13, v15 +; GFX942-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8651,22 +8651,22 @@ define void @v_shuffle_v2i64_v8i64__6_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v16 -; GFX940-NEXT: v_mov_b32_e32 v15, v17 -; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v16 +; GFX942-NEXT: v_mov_b32_e32 v15, v17 +; GFX942-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8707,22 +8707,22 @@ define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v16, v14 -; GFX940-NEXT: v_mov_b32_e32 v17, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v16, v14 +; GFX942-NEXT: v_mov_b32_e32 v17, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8753,16 +8753,16 @@ define void @v_shuffle_v2i64_v8i64__8_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 
0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8797,18 +8797,18 @@ define void @v_shuffle_v2i64_v8i64__9_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8843,18 +8843,18 @@ define void @v_shuffle_v2i64_v8i64__10_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 
v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v2 -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8889,18 +8889,18 @@ define void @v_shuffle_v2i64_v8i64__11_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v2 -; GFX940-NEXT: v_mov_b32_e32 v9, v3 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = 
shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8935,18 +8935,18 @@ define void @v_shuffle_v2i64_v8i64__12_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v2 -; GFX940-NEXT: v_mov_b32_e32 v11, v3 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -8981,18 +8981,18 @@ define void @v_shuffle_v2i64_v8i64__13_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v2 -; GFX940-NEXT: v_mov_b32_e32 v13, v3 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-NEXT: v_mov_b32_e32 v13, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9027,18 +9027,18 @@ define void @v_shuffle_v2i64_v8i64__14_9(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v2 -; GFX940-NEXT: v_mov_b32_e32 v15, v3 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v2 +; GFX942-NEXT: v_mov_b32_e32 v15, v3 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9069,16 +9069,16 @@ define void @v_shuffle_v2i64_v8i64__u_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_10: -; GFX940: ; %bb.0: -; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9119,22 +9119,22 @@ define void @v_shuffle_v2i64_v8i64__0_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v6 -; GFX940-NEXT: v_mov_b32_e32 v3, v7 -; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; 
GFX942-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9175,22 +9175,22 @@ define void @v_shuffle_v2i64_v8i64__1_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v8 -; GFX940-NEXT: v_mov_b32_e32 v5, v9 -; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9231,22 +9231,22 @@ define void @v_shuffle_v2i64_v8i64__2_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v10 -; GFX940-NEXT: v_mov_b32_e32 v7, v11 -; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9287,22 +9287,22 @@ define void @v_shuffle_v2i64_v8i64__3_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v12 -; GFX940-NEXT: v_mov_b32_e32 v9, v13 -; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9343,22 +9343,22 @@ define void @v_shuffle_v2i64_v8i64__4_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v14 -; GFX940-NEXT: v_mov_b32_e32 v11, v15 -; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v14 +; GFX942-NEXT: v_mov_b32_e32 v11, v15 +; GFX942-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> 
asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9399,22 +9399,22 @@ define void @v_shuffle_v2i64_v8i64__5_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v16 -; GFX940-NEXT: v_mov_b32_e32 v13, v17 -; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v16 +; GFX942-NEXT: v_mov_b32_e32 v13, v17 +; GFX942-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9455,22 +9455,22 @@ define void @v_shuffle_v2i64_v8i64__6_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: 
; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v18 -; GFX940-NEXT: v_mov_b32_e32 v15, v19 -; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v18 +; GFX942-NEXT: v_mov_b32_e32 v15, v19 +; GFX942-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9511,22 +9511,22 @@ define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v18, v14 -; GFX940-NEXT: v_mov_b32_e32 v19, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[18:21], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: 
;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v18, v14 +; GFX942-NEXT: v_mov_b32_e32 v19, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[18:21], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9561,18 +9561,18 @@ define void @v_shuffle_v2i64_v8i64__8_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v4 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9603,16 +9603,16 @@ define void @v_shuffle_v2i64_v8i64__9_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: 
v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9647,18 +9647,18 @@ define void @v_shuffle_v2i64_v8i64__10_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v4 -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x 
i32> @@ -9693,18 +9693,18 @@ define void @v_shuffle_v2i64_v8i64__11_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v4 -; GFX940-NEXT: v_mov_b32_e32 v9, v5 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9739,18 +9739,18 @@ define void @v_shuffle_v2i64_v8i64__12_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v4 -; GFX940-NEXT: v_mov_b32_e32 v11, v5 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: 
;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v4 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9785,18 +9785,18 @@ define void @v_shuffle_v2i64_v8i64__13_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v4 -; GFX940-NEXT: v_mov_b32_e32 v13, v5 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v4 +; GFX942-NEXT: v_mov_b32_e32 v13, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9831,18 +9831,18 @@ define void @v_shuffle_v2i64_v8i64__14_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v4 -; GFX940-NEXT: v_mov_b32_e32 v15, v5 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v4 +; GFX942-NEXT: v_mov_b32_e32 v15, v5 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9873,16 +9873,16 @@ define void @v_shuffle_v2i64_v8i64__u_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 
x i64> %vec1, <2 x i32> @@ -9923,22 +9923,22 @@ define void @v_shuffle_v2i64_v8i64__0_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v8 -; GFX940-NEXT: v_mov_b32_e32 v3, v9 -; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v8 +; GFX942-NEXT: v_mov_b32_e32 v3, v9 +; GFX942-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -9979,22 +9979,22 @@ define void @v_shuffle_v2i64_v8i64__1_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; 
GFX940-NEXT: v_mov_b32_e32 v4, v10 -; GFX940-NEXT: v_mov_b32_e32 v5, v11 -; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v10 +; GFX942-NEXT: v_mov_b32_e32 v5, v11 +; GFX942-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10035,22 +10035,22 @@ define void @v_shuffle_v2i64_v8i64__2_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v12 -; GFX940-NEXT: v_mov_b32_e32 v7, v13 -; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v12 +; GFX942-NEXT: v_mov_b32_e32 v7, v13 +; GFX942-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10091,22 +10091,22 @@ define void @v_shuffle_v2i64_v8i64__3_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v14 -; GFX940-NEXT: v_mov_b32_e32 v9, v15 -; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10147,22 +10147,22 @@ define void @v_shuffle_v2i64_v8i64__4_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] 
; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v16 -; GFX940-NEXT: v_mov_b32_e32 v11, v17 -; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v16 +; GFX942-NEXT: v_mov_b32_e32 v11, v17 +; GFX942-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10203,22 +10203,22 @@ define void @v_shuffle_v2i64_v8i64__5_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v18 -; GFX940-NEXT: v_mov_b32_e32 v13, v19 -; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v18 +; GFX942-NEXT: v_mov_b32_e32 v13, v19 +; GFX942-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10259,22 +10259,22 @@ define void @v_shuffle_v2i64_v8i64__6_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v20 -; GFX940-NEXT: v_mov_b32_e32 v15, v21 -; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v20 +; GFX942-NEXT: v_mov_b32_e32 v15, v21 +; GFX942-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] +; GFX942-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10315,22 +10315,22 @@ define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v20, v14 -; GFX940-NEXT: v_mov_b32_e32 v21, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v20, v14 +; GFX942-NEXT: v_mov_b32_e32 v21, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10365,18 +10365,18 @@ define void @v_shuffle_v2i64_v8i64__8_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; 
GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v6 -; GFX940-NEXT: v_mov_b32_e32 v3, v7 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10411,18 +10411,18 @@ define void @v_shuffle_v2i64_v8i64__9_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v6 -; GFX940-NEXT: v_mov_b32_e32 v5, v7 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v5, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> 
asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10453,16 +10453,16 @@ define void @v_shuffle_v2i64_v8i64__10_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10497,18 +10497,18 @@ define void @v_shuffle_v2i64_v8i64__11_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v6 -; GFX940-NEXT: v_mov_b32_e32 v9, v7 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; 
GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10543,18 +10543,18 @@ define void @v_shuffle_v2i64_v8i64__12_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v6 -; GFX940-NEXT: v_mov_b32_e32 v11, v7 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10589,18 +10589,18 @@ define void @v_shuffle_v2i64_v8i64__13_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: 
;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v6 -; GFX940-NEXT: v_mov_b32_e32 v13, v7 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v6 +; GFX942-NEXT: v_mov_b32_e32 v13, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10635,18 +10635,18 @@ define void @v_shuffle_v2i64_v8i64__14_11(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v6 -; GFX940-NEXT: v_mov_b32_e32 v15, v7 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v6 +; GFX942-NEXT: v_mov_b32_e32 v15, v7 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: 
s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10677,16 +10677,16 @@ define void @v_shuffle_v2i64_v8i64__u_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10727,22 +10727,22 @@ define void @v_shuffle_v2i64_v8i64__0_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v10 -; GFX940-NEXT: v_mov_b32_e32 v3, v11 -; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; 
GFX942-LABEL: v_shuffle_v2i64_v8i64__0_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v10 +; GFX942-NEXT: v_mov_b32_e32 v3, v11 +; GFX942-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10783,22 +10783,22 @@ define void @v_shuffle_v2i64_v8i64__1_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v12 -; GFX940-NEXT: v_mov_b32_e32 v5, v13 -; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v12 +; GFX942-NEXT: v_mov_b32_e32 v5, v13 +; GFX942-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10839,22 +10839,22 @@ define void @v_shuffle_v2i64_v8i64__2_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v14 -; GFX940-NEXT: v_mov_b32_e32 v7, v15 -; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v14 +; GFX942-NEXT: v_mov_b32_e32 v7, v15 +; GFX942-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10895,22 +10895,22 @@ define void @v_shuffle_v2i64_v8i64__3_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; 
GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v16 -; GFX940-NEXT: v_mov_b32_e32 v9, v17 -; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v16 +; GFX942-NEXT: v_mov_b32_e32 v9, v17 +; GFX942-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -10951,22 +10951,22 @@ define void @v_shuffle_v2i64_v8i64__4_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v18 -; GFX940-NEXT: v_mov_b32_e32 v11, v19 -; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; 
GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v18 +; GFX942-NEXT: v_mov_b32_e32 v11, v19 +; GFX942-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11007,22 +11007,22 @@ define void @v_shuffle_v2i64_v8i64__5_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v20 -; GFX940-NEXT: v_mov_b32_e32 v13, v21 -; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v20 +; GFX942-NEXT: v_mov_b32_e32 v13, v21 +; GFX942-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ 
-11063,22 +11063,22 @@ define void @v_shuffle_v2i64_v8i64__6_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v22 -; GFX940-NEXT: v_mov_b32_e32 v15, v23 -; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v22 +; GFX942-NEXT: v_mov_b32_e32 v15, v23 +; GFX942-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11119,22 +11119,22 @@ define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: 
v_mov_b32_e32 v22, v14 -; GFX940-NEXT: v_mov_b32_e32 v23, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[22:25], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v22, v14 +; GFX942-NEXT: v_mov_b32_e32 v23, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[22:25], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11169,18 +11169,18 @@ define void @v_shuffle_v2i64_v8i64__8_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v8 -; GFX940-NEXT: v_mov_b32_e32 v3, v9 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v8 +; GFX942-NEXT: v_mov_b32_e32 v3, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = 
call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11215,18 +11215,18 @@ define void @v_shuffle_v2i64_v8i64__9_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v8 -; GFX940-NEXT: v_mov_b32_e32 v5, v9 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v8 +; GFX942-NEXT: v_mov_b32_e32 v5, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11261,18 +11261,18 @@ define void @v_shuffle_v2i64_v8i64__10_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v8 -; GFX940-NEXT: v_mov_b32_e32 v7, v9 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 
s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v8 +; GFX942-NEXT: v_mov_b32_e32 v7, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11303,16 +11303,16 @@ define void @v_shuffle_v2i64_v8i64__11_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11347,18 +11347,18 @@ define void @v_shuffle_v2i64_v8i64__12_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v8 -; GFX940-NEXT: v_mov_b32_e32 v11, v9 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11393,18 +11393,18 @@ define void @v_shuffle_v2i64_v8i64__13_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v8 -; GFX940-NEXT: v_mov_b32_e32 v13, v9 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v8 +; GFX942-NEXT: v_mov_b32_e32 v13, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: 
s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11439,18 +11439,18 @@ define void @v_shuffle_v2i64_v8i64__14_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v8 -; GFX940-NEXT: v_mov_b32_e32 v15, v9 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v8 +; GFX942-NEXT: v_mov_b32_e32 v15, v9 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11481,16 +11481,16 @@ define void @v_shuffle_v2i64_v8i64__u_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
v_shuffle_v2i64_v8i64__u_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11531,22 +11531,22 @@ define void @v_shuffle_v2i64_v8i64__0_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v12 -; GFX940-NEXT: v_mov_b32_e32 v3, v13 -; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v12 +; GFX942-NEXT: v_mov_b32_e32 v3, v13 +; GFX942-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11587,22 +11587,22 @@ define void 
@v_shuffle_v2i64_v8i64__1_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v14 -; GFX940-NEXT: v_mov_b32_e32 v5, v15 -; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v14 +; GFX942-NEXT: v_mov_b32_e32 v5, v15 +; GFX942-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11643,22 +11643,22 @@ define void @v_shuffle_v2i64_v8i64__2_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v16 -; GFX940-NEXT: v_mov_b32_e32 
v7, v17 -; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v16 +; GFX942-NEXT: v_mov_b32_e32 v7, v17 +; GFX942-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11699,22 +11699,22 @@ define void @v_shuffle_v2i64_v8i64__3_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v18 -; GFX940-NEXT: v_mov_b32_e32 v9, v19 -; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v18 +; GFX942-NEXT: 
v_mov_b32_e32 v9, v19 +; GFX942-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11755,22 +11755,22 @@ define void @v_shuffle_v2i64_v8i64__4_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v20 -; GFX940-NEXT: v_mov_b32_e32 v11, v21 -; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v20 +; GFX942-NEXT: v_mov_b32_e32 v11, v21 +; GFX942-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11811,22 +11811,22 @@ define void @v_shuffle_v2i64_v8i64__5_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_13: -; GFX940: ; 
%bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v22 -; GFX940-NEXT: v_mov_b32_e32 v13, v23 -; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v22 +; GFX942-NEXT: v_mov_b32_e32 v13, v23 +; GFX942-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11867,22 +11867,22 @@ define void @v_shuffle_v2i64_v8i64__6_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v24 -; GFX940-NEXT: v_mov_b32_e32 v15, v25 -; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
v_shuffle_v2i64_v8i64__6_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v24 +; GFX942-NEXT: v_mov_b32_e32 v15, v25 +; GFX942-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11923,22 +11923,22 @@ define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v24, v14 -; GFX940-NEXT: v_mov_b32_e32 v25, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v24, v14 +; GFX942-NEXT: v_mov_b32_e32 v25, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -11973,18 +11973,18 @@ define void @v_shuffle_v2i64_v8i64__8_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v10 -; GFX940-NEXT: v_mov_b32_e32 v3, v11 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v10 +; GFX942-NEXT: v_mov_b32_e32 v3, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12019,18 +12019,18 @@ define void @v_shuffle_v2i64_v8i64__9_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v10 -; GFX940-NEXT: v_mov_b32_e32 v5, v11 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v10 +; GFX942-NEXT: v_mov_b32_e32 v5, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12065,18 +12065,18 @@ define void @v_shuffle_v2i64_v8i64__10_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v10 -; GFX940-NEXT: v_mov_b32_e32 v7, v11 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v10 +; GFX942-NEXT: v_mov_b32_e32 v7, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12111,18 +12111,18 @@ define void @v_shuffle_v2i64_v8i64__11_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v10 -; GFX940-NEXT: v_mov_b32_e32 v9, v11 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v10 +; GFX942-NEXT: v_mov_b32_e32 v9, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12153,16 +12153,16 @@ define void @v_shuffle_v2i64_v8i64__12_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt 
vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12197,18 +12197,18 @@ define void @v_shuffle_v2i64_v8i64__13_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v10 -; GFX940-NEXT: v_mov_b32_e32 v13, v11 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v10 +; GFX942-NEXT: v_mov_b32_e32 v13, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12243,18 +12243,18 @@ define void @v_shuffle_v2i64_v8i64__14_13(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v10 -; GFX940-NEXT: v_mov_b32_e32 v15, v11 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] 
sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v10 +; GFX942-NEXT: v_mov_b32_e32 v15, v11 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12285,16 +12285,16 @@ define void @v_shuffle_v2i64_v8i64__u_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12335,22 +12335,22 @@ define void @v_shuffle_v2i64_v8i64__0_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_14: -; 
GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v14 -; GFX940-NEXT: v_mov_b32_e32 v3, v15 -; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v14 +; GFX942-NEXT: v_mov_b32_e32 v3, v15 +; GFX942-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12391,22 +12391,22 @@ define void @v_shuffle_v2i64_v8i64__1_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v16 -; GFX940-NEXT: v_mov_b32_e32 v5, v17 -; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
v_shuffle_v2i64_v8i64__1_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v16 +; GFX942-NEXT: v_mov_b32_e32 v5, v17 +; GFX942-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12447,22 +12447,22 @@ define void @v_shuffle_v2i64_v8i64__2_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v18 -; GFX940-NEXT: v_mov_b32_e32 v7, v19 -; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v18 +; GFX942-NEXT: v_mov_b32_e32 v7, v19 +; GFX942-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call 
<8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12503,22 +12503,22 @@ define void @v_shuffle_v2i64_v8i64__3_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v20 -; GFX940-NEXT: v_mov_b32_e32 v9, v21 -; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v20 +; GFX942-NEXT: v_mov_b32_e32 v9, v21 +; GFX942-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12559,22 +12559,22 @@ define void @v_shuffle_v2i64_v8i64__4_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 
v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v22 -; GFX940-NEXT: v_mov_b32_e32 v11, v23 -; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v22 +; GFX942-NEXT: v_mov_b32_e32 v11, v23 +; GFX942-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12615,22 +12615,22 @@ define void @v_shuffle_v2i64_v8i64__5_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v24 -; GFX940-NEXT: v_mov_b32_e32 v13, v25 -; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v24 +; GFX942-NEXT: v_mov_b32_e32 v13, v25 +; GFX942-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12671,22 +12671,22 @@ define void @v_shuffle_v2i64_v8i64__6_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v26 -; GFX940-NEXT: v_mov_b32_e32 v15, v27 -; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v26 +; GFX942-NEXT: v_mov_b32_e32 v15, v27 +; GFX942-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12727,22 +12727,22 @@ 
define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v26, v14 -; GFX940-NEXT: v_mov_b32_e32 v27, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[26:29], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__7_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v26, v14 +; GFX942-NEXT: v_mov_b32_e32 v27, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[26:29], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12777,18 +12777,18 @@ define void @v_shuffle_v2i64_v8i64__8_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v12 -; GFX940-NEXT: v_mov_b32_e32 v3, v13 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; 
GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v12 +; GFX942-NEXT: v_mov_b32_e32 v3, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12823,18 +12823,18 @@ define void @v_shuffle_v2i64_v8i64__9_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v12 -; GFX940-NEXT: v_mov_b32_e32 v5, v13 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v12 +; GFX942-NEXT: v_mov_b32_e32 v5, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12869,18 +12869,18 @@ define void @v_shuffle_v2i64_v8i64__10_14(ptr addrspace(1) inreg 
%ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v12 -; GFX940-NEXT: v_mov_b32_e32 v7, v13 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v12 +; GFX942-NEXT: v_mov_b32_e32 v7, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12915,18 +12915,18 @@ define void @v_shuffle_v2i64_v8i64__11_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v12 -; GFX940-NEXT: v_mov_b32_e32 v9, v13 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v12 +; GFX942-NEXT: v_mov_b32_e32 v9, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -12961,18 +12961,18 @@ define void @v_shuffle_v2i64_v8i64__12_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v12 -; GFX940-NEXT: v_mov_b32_e32 v11, v13 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v12 +; GFX942-NEXT: v_mov_b32_e32 v11, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13003,16 +13003,16 @@ define void @v_shuffle_v2i64_v8i64__13_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: 
; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13047,18 +13047,18 @@ define void @v_shuffle_v2i64_v8i64__14_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v12 -; GFX940-NEXT: v_mov_b32_e32 v15, v13 -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v12 +; GFX942-NEXT: v_mov_b32_e32 v15, v13 +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13089,16 +13089,16 @@ define void 
@v_shuffle_v2i64_v8i64__u_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__u_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13139,22 +13139,22 @@ define void @v_shuffle_v2i64_v8i64__0_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v18, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:17] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v16 -; GFX940-NEXT: v_mov_b32_e32 v3, v17 -; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__0_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:17] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v16 +; GFX942-NEXT: v_mov_b32_e32 v3, v17 +; GFX942-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13195,22 +13195,22 @@ define void @v_shuffle_v2i64_v8i64__1_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v20, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v18 -; GFX940-NEXT: v_mov_b32_e32 v5, v19 -; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__1_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v18 +; GFX942-NEXT: v_mov_b32_e32 v5, v19 +; GFX942-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13251,22 +13251,22 @@ define void 
@v_shuffle_v2i64_v8i64__2_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v22, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[6:21] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v20 -; GFX940-NEXT: v_mov_b32_e32 v7, v21 -; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__2_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[6:21] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v20 +; GFX942-NEXT: v_mov_b32_e32 v7, v21 +; GFX942-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13307,22 +13307,22 @@ define void @v_shuffle_v2i64_v8i64__3_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v24, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v22 -; GFX940-NEXT: v_mov_b32_e32 
v9, v23 -; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__3_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v22 +; GFX942-NEXT: v_mov_b32_e32 v9, v23 +; GFX942-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13363,22 +13363,22 @@ define void @v_shuffle_v2i64_v8i64__4_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v26, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[10:25] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v24 -; GFX940-NEXT: v_mov_b32_e32 v11, v25 -; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__4_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[10:25] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v24 +; 
GFX942-NEXT: v_mov_b32_e32 v11, v25 +; GFX942-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13419,22 +13419,22 @@ define void @v_shuffle_v2i64_v8i64__5_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v28, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v26 -; GFX940-NEXT: v_mov_b32_e32 v13, v27 -; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__5_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v26 +; GFX942-NEXT: v_mov_b32_e32 v13, v27 +; GFX942-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13475,22 +13475,22 @@ define void @v_shuffle_v2i64_v8i64__6_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_15: 
-; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v30, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[14:29] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v14, v28 -; GFX940-NEXT: v_mov_b32_e32 v15, v29 -; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__6_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[14:29] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v14, v28 +; GFX942-NEXT: v_mov_b32_e32 v15, v29 +; GFX942-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13531,22 +13531,22 @@ define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v32, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v28, v14 -; GFX940-NEXT: v_mov_b32_e32 v29, v15 -; GFX940-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
v_shuffle_v2i64_v8i64__7_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v28, v14 +; GFX942-NEXT: v_mov_b32_e32 v29, v15 +; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13581,18 +13581,18 @@ define void @v_shuffle_v2i64_v8i64__8_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v14 -; GFX940-NEXT: v_mov_b32_e32 v3, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__8_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v14 +; GFX942-NEXT: v_mov_b32_e32 v3, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13627,18 +13627,18 @@ define void 
@v_shuffle_v2i64_v8i64__9_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v14 -; GFX940-NEXT: v_mov_b32_e32 v5, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__9_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v14 +; GFX942-NEXT: v_mov_b32_e32 v5, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13673,18 +13673,18 @@ define void @v_shuffle_v2i64_v8i64__10_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v6, v14 -; GFX940-NEXT: v_mov_b32_e32 v7, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__10_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def 
v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v14 +; GFX942-NEXT: v_mov_b32_e32 v7, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13719,18 +13719,18 @@ define void @v_shuffle_v2i64_v8i64__11_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v8, v14 -; GFX940-NEXT: v_mov_b32_e32 v9, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__11_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v14 +; GFX942-NEXT: v_mov_b32_e32 v9, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13765,18 +13765,18 @@ define void @v_shuffle_v2i64_v8i64__12_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; 
GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v10, v14 -; GFX940-NEXT: v_mov_b32_e32 v11, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__12_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, v14 +; GFX942-NEXT: v_mov_b32_e32 v11, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13811,18 +13811,18 @@ define void @v_shuffle_v2i64_v8i64__13_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: v_mov_b32_e32 v12, v14 -; GFX940-NEXT: v_mov_b32_e32 v13, v15 -; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__13_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, v14 +; GFX942-NEXT: v_mov_b32_e32 v13, v15 +; GFX942-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13853,16 +13853,16 @@ define void @v_shuffle_v2i64_v8i64__14_15(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v16, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_shuffle_v2i64_v8i64__14_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() %vec1 = call <8 x i64> asm "; def $0", "=v"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -13907,17 +13907,17 @@ define void @s_shuffle_v2i64_v8i64__0_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; 
GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -13951,18 +13951,18 @@ define void @s_shuffle_v2i64_v8i64__1_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -13992,17 +13992,17 @@ define void @s_shuffle_v2i64_v8i64__2_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; 
GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -14036,18 +14036,18 @@ define void @s_shuffle_v2i64_v8i64__3_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -14081,17 +14081,17 @@ define void @s_shuffle_v2i64_v8i64__4_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
s_shuffle_v2i64_v8i64__4_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -14125,18 +14125,18 @@ define void @s_shuffle_v2i64_v8i64__5_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -14170,18 +14170,18 @@ define void @s_shuffle_v2i64_v8i64__6_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; 
GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -14215,18 +14215,18 @@ define void @s_shuffle_v2i64_v8i64__7_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -14274,18 +14274,18 @@ define void @s_shuffle_v2i64_v8i64__9_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: s_shuffle_v2i64_v8i64__9_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -14316,17 +14316,17 @@ define void @s_shuffle_v2i64_v8i64__10_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -14361,18 +14361,18 @@ define void 
@s_shuffle_v2i64_v8i64__11_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -14407,17 +14407,17 @@ define void @s_shuffle_v2i64_v8i64__12_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> 
%vec0, <8 x i64> %vec1, <2 x i32> @@ -14452,18 +14452,18 @@ define void @s_shuffle_v2i64_v8i64__13_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -14498,18 +14498,18 @@ define void @s_shuffle_v2i64_v8i64__14_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; 
GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -14544,18 +14544,18 @@ define void @s_shuffle_v2i64_v8i64__15_u() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_u: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_u: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -14600,24 +14600,24 @@ define void @s_shuffle_v2i64_v8i64__15_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s18 -; GFX940-NEXT: s_mov_b32 s9, s19 -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND 
-; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -14658,22 +14658,22 @@ define void @s_shuffle_v2i64_v8i64__15_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s26 -; GFX940-NEXT: s_mov_b32 s9, s27 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s26 +; GFX942-NEXT: s_mov_b32 s9, s27 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() 
%shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -14740,24 +14740,24 @@ define void @s_shuffle_v2i64_v8i64__15_2() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s22 -; GFX940-NEXT: s_mov_b32 s9, s23 -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s22 +; GFX942-NEXT: s_mov_b32 s9, s23 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -14798,22 +14798,22 @@ define void @s_shuffle_v2i64_v8i64__15_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s26 -; 
GFX940-NEXT: s_mov_b32 s9, s27 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s26 +; GFX942-NEXT: s_mov_b32 s9, s27 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -14880,35 +14880,35 @@ define void @s_shuffle_v2i64_v8i64__15_4() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s30 -; GFX940-NEXT: s_mov_b32 s9, s31 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: 
s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s30 +; GFX942-NEXT: s_mov_b32 s9, s31 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -14975,22 +14975,22 @@ define void @s_shuffle_v2i64_v8i64__15_5() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s26 -; GFX940-NEXT: s_mov_b32 s9, s27 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_5: 
+; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s26 +; GFX942-NEXT: s_mov_b32 s9, s27 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -15009,18 +15009,10 @@ define void @s_shuffle_v2i64_v8i64__15_6() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -15034,18 +15026,10 @@ define void @s_shuffle_v2i64_v8i64__15_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 
-; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -15066,18 +15050,10 @@ define void @s_shuffle_v2i64_v8i64__15_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -15091,18 +15067,10 @@ define void @s_shuffle_v2i64_v8i64__15_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 
s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -15113,35 +15081,35 @@ define void @s_shuffle_v2i64_v8i64__15_6() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s30 -; GFX940-NEXT: s_mov_b32 s9, s31 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; 
GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s30 +; GFX942-NEXT: s_mov_b32 s9, s31 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -15160,18 +15128,10 @@ define void @s_shuffle_v2i64_v8i64__15_7() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -15185,18 +15145,10 @@ define void 
@s_shuffle_v2i64_v8i64__15_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -15217,18 +15169,10 @@ define void @s_shuffle_v2i64_v8i64__15_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -15242,18 +15186,10 @@ define void @s_shuffle_v2i64_v8i64__15_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: 
; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -15264,35 +15200,35 @@ define void @s_shuffle_v2i64_v8i64__15_7() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s12, s30 -; GFX940-NEXT: s_mov_b32 s13, s31 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; 
GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s30 +; GFX942-NEXT: s_mov_b32 s13, s31 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -15331,20 +15267,20 @@ define void @s_shuffle_v2i64_v8i64__15_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
s_shuffle_v2i64_v8i64__15_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -15403,20 +15339,20 @@ define void @s_shuffle_v2i64_v8i64__15_10() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -15497,18 +15433,18 @@ define void @s_shuffle_v2i64_v8i64__15_13() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
s_shuffle_v2i64_v8i64__15_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -15547,20 +15483,20 @@ define void @s_shuffle_v2i64_v8i64__15_14() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] 
%vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -15599,20 +15535,20 @@ define void @s_shuffle_v2i64_v8i64__15_15() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s12, s14 -; GFX940-NEXT: s_mov_b32 s13, s15 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -15647,18 +15583,18 @@ define void @s_shuffle_v2i64_v8i64__u_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_0: +; 
GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -15715,20 +15651,20 @@ define void @s_shuffle_v2i64_v8i64__1_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -15785,20 +15721,20 @@ define void @s_shuffle_v2i64_v8i64__3_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -15836,18 +15772,18 @@ define void @s_shuffle_v2i64_v8i64__4_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x 
i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -15885,20 +15821,20 @@ define void @s_shuffle_v2i64_v8i64__5_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -15936,20 +15872,20 @@ define void @s_shuffle_v2i64_v8i64__6_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s0 -; GFX940-NEXT: s_mov_b32 s15, s1 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_0: +; GFX942: ; 
%bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -15987,20 +15923,20 @@ define void @s_shuffle_v2i64_v8i64__7_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -16034,18 +15970,18 @@ define void @s_shuffle_v2i64_v8i64__8_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
s_shuffle_v2i64_v8i64__8_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -16089,24 +16025,24 @@ define void @s_shuffle_v2i64_v8i64__9_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; 
GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -16125,18 +16061,10 @@ define void @s_shuffle_v2i64_v8i64__10_0() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -16148,18 +16076,10 @@ define void @s_shuffle_v2i64_v8i64__10_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: 
v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -16180,18 +16100,10 @@ define void @s_shuffle_v2i64_v8i64__10_0() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -16203,18 +16115,10 @@ define void @s_shuffle_v2i64_v8i64__10_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; 
GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -16225,22 +16129,22 @@ define void @s_shuffle_v2i64_v8i64__10_0() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -16285,24 +16189,24 @@ define void @s_shuffle_v2i64_v8i64__11_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: 
;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -16347,33 +16251,33 @@ define void @s_shuffle_v2i64_v8i64__12_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s16 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s11, s17 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: 
scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -16418,24 +16322,24 @@ define void @s_shuffle_v2i64_v8i64__13_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; 
GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -16480,24 +16384,24 @@ define void @s_shuffle_v2i64_v8i64__14_0() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s18, s0 -; GFX940-NEXT: s_mov_b32 s19, s1 -; GFX940-NEXT: s_mov_b64 s[8:9], s[16:17] -; GFX940-NEXT: s_mov_b64 s[10:11], s[18:19] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s18, s0 +; GFX942-NEXT: s_mov_b32 s19, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[16:17] +; GFX942-NEXT: s_mov_b64 s[10:11], s[18:19] +; GFX942-NEXT: 
;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -16528,17 +16432,17 @@ define void @s_shuffle_v2i64_v8i64__u_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -16568,17 +16472,17 @@ define void @s_shuffle_v2i64_v8i64__0_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; 
GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -16673,18 +16577,18 @@ define void @s_shuffle_v2i64_v8i64__4_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s2 -; GFX940-NEXT: s_mov_b32 s11, s3 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s11, s3 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -16741,20 +16645,20 @@ define void @s_shuffle_v2i64_v8i64__6_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s2 -; GFX940-NEXT: s_mov_b32 s15, s3 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: 
s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s2 +; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -16803,17 +16707,17 @@ define void @s_shuffle_v2i64_v8i64__8_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -16853,22 +16757,22 @@ define void @s_shuffle_v2i64_v8i64__9_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def 
s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -16887,18 +16791,10 @@ define void @s_shuffle_v2i64_v8i64__10_1() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -16910,18 +16806,10 @@ define 
void @s_shuffle_v2i64_v8i64__10_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -16942,18 +16830,10 @@ define void @s_shuffle_v2i64_v8i64__10_1() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -16965,18 +16845,10 @@ define void @s_shuffle_v2i64_v8i64__10_1() { ; GFX90A-NEXT: ;;#ASMSTART ; 
GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -16987,22 +16859,22 @@ define void @s_shuffle_v2i64_v8i64__10_1() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s2 -; GFX940-NEXT: s_mov_b32 s11, s3 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s11, s3 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -17043,22 +16915,22 @@ define void @s_shuffle_v2i64_v8i64__11_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s18 -; GFX940-NEXT: s_mov_b32 s9, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -17103,33 +16975,33 @@ define void @s_shuffle_v2i64_v8i64__12_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; 
GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s18 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s11, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -17170,22 +17042,22 @@ define void @s_shuffle_v2i64_v8i64__13_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_1: -; 
GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s22 -; GFX940-NEXT: s_mov_b32 s9, s23 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s22 +; GFX942-NEXT: s_mov_b32 s9, s23 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -17230,24 +17102,24 @@ define void @s_shuffle_v2i64_v8i64__14_1() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s18, s2 -; GFX940-NEXT: s_mov_b32 s19, s3 -; GFX940-NEXT: s_mov_b64 s[8:9], s[16:17] -; GFX940-NEXT: s_mov_b64 s[10:11], s[18:19] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; 
GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s18, s2 +; GFX942-NEXT: s_mov_b32 s19, s3 +; GFX942-NEXT: s_mov_b64 s[8:9], s[16:17] +; GFX942-NEXT: s_mov_b64 s[10:11], s[18:19] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -17282,18 +17154,18 @@ define void @s_shuffle_v2i64_v8i64__u_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -17350,20 +17222,20 @@ define void @s_shuffle_v2i64_v8i64__1_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def 
s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -17420,20 +17292,20 @@ define void @s_shuffle_v2i64_v8i64__3_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x 
i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -17471,18 +17343,18 @@ define void @s_shuffle_v2i64_v8i64__4_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -17520,20 +17392,20 @@ define void @s_shuffle_v2i64_v8i64__5_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: 
;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -17571,20 +17443,20 @@ define void @s_shuffle_v2i64_v8i64__6_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s4 -; GFX940-NEXT: s_mov_b32 s15, s5 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -17622,20 +17494,20 @@ define void @s_shuffle_v2i64_v8i64__7_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -17669,18 +17541,18 @@ define void @s_shuffle_v2i64_v8i64__8_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = 
shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -17746,24 +17618,24 @@ define void @s_shuffle_v2i64_v8i64__9_2() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -17826,34 +17698,34 @@ define void @s_shuffle_v2i64_v8i64__10_2() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; 
GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s10, s20 -; GFX940-NEXT: s_mov_b32 s11, s21 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s10, s20 +; GFX942-NEXT: s_mov_b32 s11, s21 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -17920,24 +17792,24 @@ 
define void @s_shuffle_v2i64_v8i64__11_2() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -17982,22 +17854,22 @@ define void @s_shuffle_v2i64_v8i64__12_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s16 -; GFX940-NEXT: s_mov_b32 s11, s17 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; 
GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -18064,24 +17936,24 @@ define void @s_shuffle_v2i64_v8i64__13_2() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s18 -; GFX940-NEXT: s_mov_b32 s9, s19 -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -18126,24 +17998,24 @@ define void @s_shuffle_v2i64_v8i64__14_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_2: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s22, s4 -; GFX940-NEXT: s_mov_b32 s23, s5 -; GFX940-NEXT: s_mov_b64 s[8:9], s[20:21] -; GFX940-NEXT: s_mov_b64 s[10:11], s[22:23] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s22, s4 +; GFX942-NEXT: s_mov_b32 s23, s5 +; GFX942-NEXT: s_mov_b64 s[8:9], s[20:21] +; GFX942-NEXT: s_mov_b64 s[10:11], s[22:23] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -18174,17 +18046,17 @@ define void @s_shuffle_v2i64_v8i64__u_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: 
s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -18252,17 +18124,17 @@ define void @s_shuffle_v2i64_v8i64__2_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -18319,18 +18191,18 @@ define void @s_shuffle_v2i64_v8i64__4_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: 
;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s6 -; GFX940-NEXT: s_mov_b32 s11, s7 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -18387,20 +18259,20 @@ define void @s_shuffle_v2i64_v8i64__6_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s6 -; GFX940-NEXT: s_mov_b32 s15, s7 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm 
sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -18449,17 +18321,17 @@ define void @s_shuffle_v2i64_v8i64__8_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -18499,22 +18371,22 @@ define void @s_shuffle_v2i64_v8i64__9_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; 
GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -18577,34 +18449,34 @@ define void @s_shuffle_v2i64_v8i64__10_3() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s10, s22 -; GFX940-NEXT: s_mov_b32 s11, s23 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: 
; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s10, s22 +; GFX942-NEXT: s_mov_b32 s11, s23 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -18645,22 +18517,22 @@ define void @s_shuffle_v2i64_v8i64__11_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s18 -; GFX940-NEXT: s_mov_b32 s9, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", 
"=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -18705,22 +18577,22 @@ define void @s_shuffle_v2i64_v8i64__12_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s18 -; GFX940-NEXT: s_mov_b32 s11, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -18761,22 +18633,22 @@ define void @s_shuffle_v2i64_v8i64__13_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s22 -; GFX940-NEXT: s_mov_b32 s9, s23 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: 
s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s22 +; GFX942-NEXT: s_mov_b32 s9, s23 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -18821,24 +18693,24 @@ define void @s_shuffle_v2i64_v8i64__14_3() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_3: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s22, s6 -; GFX940-NEXT: s_mov_b32 s23, s7 -; GFX940-NEXT: s_mov_b64 s[8:9], s[20:21] -; GFX940-NEXT: s_mov_b64 s[10:11], s[22:23] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_3: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s22, s6 +; GFX942-NEXT: s_mov_b32 s23, s7 +; GFX942-NEXT: s_mov_b64 s[8:9], s[20:21] +; GFX942-NEXT: s_mov_b64 s[10:11], s[22:23] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] 
%vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -18873,18 +18745,18 @@ define void @s_shuffle_v2i64_v8i64__u_4() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s8 -; GFX940-NEXT: s_mov_b32 s11, s9 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -19002,18 +18874,18 @@ define void @s_shuffle_v2i64_v8i64__4_4() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s8 -; GFX940-NEXT: s_mov_b32 s11, s9 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -19072,20 +18944,20 @@ define void @s_shuffle_v2i64_v8i64__6_4() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s8 -; GFX940-NEXT: s_mov_b32 s15, s9 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s8 +; GFX942-NEXT: s_mov_b32 s15, s9 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -19140,18 +19012,18 @@ define void @s_shuffle_v2i64_v8i64__8_4() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: 
s_mov_b32 s10, s8 -; GFX940-NEXT: s_mov_b32 s11, s9 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -19217,35 +19089,35 @@ define void @s_shuffle_v2i64_v8i64__9_4() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s18 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s9, s19 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; 
GFX942-LABEL: s_shuffle_v2i64_v8i64__9_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -19286,22 +19158,22 @@ define void @s_shuffle_v2i64_v8i64__10_4() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s20 -; GFX940-NEXT: s_mov_b32 s11, s21 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s20 +; GFX942-NEXT: s_mov_b32 s11, s21 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -19368,35 +19240,35 @@ define void @s_shuffle_v2i64_v8i64__11_4() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s22 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s9, s23 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword 
off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s22 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s9, s23 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -19463,22 +19335,22 @@ define void @s_shuffle_v2i64_v8i64__12_4() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s16 -; GFX940-NEXT: s_mov_b32 s11, s17 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: 
;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -19545,35 +19417,35 @@ define void @s_shuffle_v2i64_v8i64__13_4() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s26 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s9, s27 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; 
GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s26 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s9, s27 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -19640,24 +19512,24 @@ define void @s_shuffle_v2i64_v8i64__14_4() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_4: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s26, s8 -; GFX940-NEXT: s_mov_b32 s27, s9 -; GFX940-NEXT: s_mov_b64 s[8:9], s[24:25] -; GFX940-NEXT: s_mov_b64 s[10:11], s[26:27] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s26, s8 +; GFX942-NEXT: s_mov_b32 s27, s9 +; GFX942-NEXT: 
s_mov_b64 s[8:9], s[24:25] +; GFX942-NEXT: s_mov_b64 s[10:11], s[26:27] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -19692,17 +19564,17 @@ define void @s_shuffle_v2i64_v8i64__u_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -19759,18 +19631,18 @@ define void @s_shuffle_v2i64_v8i64__1_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -19827,18 +19699,18 @@ define void @s_shuffle_v2i64_v8i64__3_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -19872,17 +19744,17 @@ define void @s_shuffle_v2i64_v8i64__4_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: 
;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -19920,18 +19792,18 @@ define void @s_shuffle_v2i64_v8i64__5_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -19969,20 +19841,20 @@ define void @s_shuffle_v2i64_v8i64__6_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; 
GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s10 -; GFX940-NEXT: s_mov_b32 s15, s11 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -20020,18 +19892,18 @@ define void @s_shuffle_v2i64_v8i64__7_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> 
call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -20065,17 +19937,17 @@ define void @s_shuffle_v2i64_v8i64__8_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -20141,22 +20013,22 @@ define void @s_shuffle_v2i64_v8i64__9_5() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; 
def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -20197,22 +20069,22 @@ define void @s_shuffle_v2i64_v8i64__10_5() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s22 -; GFX940-NEXT: s_mov_b32 s11, s23 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s22 +; GFX942-NEXT: s_mov_b32 s11, s23 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -20279,22 +20151,22 @@ define void @s_shuffle_v2i64_v8i64__11_5() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: 
; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s18 -; GFX940-NEXT: s_mov_b32 s9, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -20361,22 +20233,22 @@ define void @s_shuffle_v2i64_v8i64__12_5() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s18 -; GFX940-NEXT: s_mov_b32 s11, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s18 +; 
GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -20443,22 +20315,22 @@ define void @s_shuffle_v2i64_v8i64__13_5() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s22 -; GFX940-NEXT: s_mov_b32 s9, s23 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s22 +; GFX942-NEXT: s_mov_b32 s9, s23 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -20525,24 +20397,24 @@ define void @s_shuffle_v2i64_v8i64__14_5() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_5: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; 
GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s26, s10 -; GFX940-NEXT: s_mov_b32 s27, s11 -; GFX940-NEXT: s_mov_b64 s[8:9], s[24:25] -; GFX940-NEXT: s_mov_b64 s[10:11], s[26:27] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_5: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s26, s10 +; GFX942-NEXT: s_mov_b32 s27, s11 +; GFX942-NEXT: s_mov_b64 s[8:9], s[24:25] +; GFX942-NEXT: s_mov_b64 s[10:11], s[26:27] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -20577,18 +20449,18 @@ define void @s_shuffle_v2i64_v8i64__u_6() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: 
;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -20645,20 +20517,20 @@ define void @s_shuffle_v2i64_v8i64__1_6() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -20715,20 +20587,20 @@ define void @s_shuffle_v2i64_v8i64__3_6() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; 
GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -20766,18 +20638,18 @@ define void @s_shuffle_v2i64_v8i64__4_6() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -20815,20 +20687,20 @@ define void @s_shuffle_v2i64_v8i64__5_6() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_6: -; GFX940: ; 
%bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -20866,20 +20738,20 @@ define void @s_shuffle_v2i64_v8i64__6_6() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s12 -; GFX940-NEXT: s_mov_b32 s15, s13 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: 
s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -20917,20 +20789,20 @@ define void @s_shuffle_v2i64_v8i64__7_6() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -20964,18 +20836,18 @@ define void @s_shuffle_v2i64_v8i64__8_6() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: 
;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -20993,18 +20865,10 @@ define void @s_shuffle_v2i64_v8i64__9_6() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -21018,18 +20882,10 @@ define void @s_shuffle_v2i64_v8i64__9_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: 
v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -21050,18 +20906,10 @@ define void @s_shuffle_v2i64_v8i64__9_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -21075,18 +20923,10 @@ define void @s_shuffle_v2i64_v8i64__9_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; 
GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -21097,35 +20937,35 @@ define void @s_shuffle_v2i64_v8i64__9_6() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s18 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s9, s19 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 
4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -21166,22 +21006,22 @@ define void @s_shuffle_v2i64_v8i64__10_6() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s20 -; GFX940-NEXT: s_mov_b32 s11, s21 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: 
s_mov_b32 s10, s20 +; GFX942-NEXT: s_mov_b32 s11, s21 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -21200,18 +21040,10 @@ define void @s_shuffle_v2i64_v8i64__11_6() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -21225,18 +21057,10 @@ define void @s_shuffle_v2i64_v8i64__11_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 
s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -21257,18 +21081,10 @@ define void @s_shuffle_v2i64_v8i64__11_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -21282,18 +21098,10 @@ define void @s_shuffle_v2i64_v8i64__11_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: 
v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -21304,35 +21112,35 @@ define void @s_shuffle_v2i64_v8i64__11_6() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s22 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s9, s23 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: s_mov_b32 s8, s22 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s9, s23 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -21351,18 +21159,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -21376,18 +21176,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; 
GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -21408,18 +21200,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -21433,18 +21217,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 
s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -21455,22 +21231,22 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s16 -; GFX940-NEXT: s_mov_b32 s11, s17 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -21489,18 +21265,10 @@ define void @s_shuffle_v2i64_v8i64__13_6() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: 
v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -21514,18 +21282,10 @@ define void @s_shuffle_v2i64_v8i64__13_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -21546,18 +21306,10 @@ define void @s_shuffle_v2i64_v8i64__13_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; 
GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -21571,18 +21323,10 @@ define void @s_shuffle_v2i64_v8i64__13_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -21593,35 +21337,35 @@ define void @s_shuffle_v2i64_v8i64__13_6() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s26 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s9, s27 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s26 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s9, s27 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; 
GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -21640,18 +21384,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -21665,18 +21401,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; 
GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -21697,18 +21425,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -21722,18 +21442,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, 
v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -21744,35 +21456,35 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_6: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s30, s12 -; GFX940-NEXT: s_mov_b32 s31, s13 -; GFX940-NEXT: s_mov_b64 s[8:9], s[28:29] -; GFX940-NEXT: s_mov_b64 s[10:11], s[30:31] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_6: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s30, s12 +; GFX942-NEXT: s_mov_b32 s31, s13 +; GFX942-NEXT: 
s_mov_b64 s[8:9], s[28:29] +; GFX942-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -21807,18 +21519,18 @@ define void @s_shuffle_v2i64_v8i64__u_7() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -21875,20 +21587,20 @@ define void @s_shuffle_v2i64_v8i64__1_7() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s12, s2 -; GFX940-NEXT: s_mov_b32 s13, s3 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -21945,20 +21657,20 @@ define void @s_shuffle_v2i64_v8i64__3_7() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s12, s6 -; GFX940-NEXT: s_mov_b32 s13, s7 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 
s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -21996,18 +21708,18 @@ define void @s_shuffle_v2i64_v8i64__4_7() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s14 -; GFX940-NEXT: s_mov_b32 s11, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -22045,20 +21757,20 @@ define void @s_shuffle_v2i64_v8i64__5_7() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s12, s10 -; GFX940-NEXT: s_mov_b32 s13, s11 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: 
;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -22092,18 +21804,18 @@ define void @s_shuffle_v2i64_v8i64__6_7() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -22141,20 +21853,20 @@ define void @s_shuffle_v2i64_v8i64__7_7() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
s_shuffle_v2i64_v8i64__7_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s12, s14 -; GFX940-NEXT: s_mov_b32 s13, s15 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s14 +; GFX942-NEXT: s_mov_b32 s13, s15 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -22188,18 +21900,18 @@ define void @s_shuffle_v2i64_v8i64__8_7() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART 
+; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -22217,18 +21929,10 @@ define void @s_shuffle_v2i64_v8i64__9_7() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -22242,18 +21946,10 @@ define void @s_shuffle_v2i64_v8i64__9_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; 
GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -22274,18 +21970,10 @@ define void @s_shuffle_v2i64_v8i64__9_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -22299,18 +21987,10 @@ define void @s_shuffle_v2i64_v8i64__9_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, 
v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -22321,35 +22001,35 @@ define void @s_shuffle_v2i64_v8i64__9_7() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s12, s18 -; GFX940-NEXT: s_mov_b32 s13, s19 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s18 +; GFX942-NEXT: 
s_mov_b32 s13, s19 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -22390,22 +22070,22 @@ define void @s_shuffle_v2i64_v8i64__10_7() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s22 -; GFX940-NEXT: s_mov_b32 s11, s23 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s22 +; GFX942-NEXT: s_mov_b32 s11, s23 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -22424,18 +22104,10 @@ define 
void @s_shuffle_v2i64_v8i64__11_7() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -22449,18 +22121,10 @@ define void @s_shuffle_v2i64_v8i64__11_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -22481,18 +22145,10 @@ define void @s_shuffle_v2i64_v8i64__11_7() { ; GFX90A-NEXT: v_writelane_b32 v0, 
s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -22506,18 +22162,10 @@ define void @s_shuffle_v2i64_v8i64__11_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -22528,35 +22176,35 @@ define void @s_shuffle_v2i64_v8i64__11_7() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
s_shuffle_v2i64_v8i64__11_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s12, s22 -; GFX940-NEXT: s_mov_b32 s13, s23 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s22 +; GFX942-NEXT: s_mov_b32 s13, s23 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; 
GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -22575,18 +22223,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -22600,18 +22240,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: 
v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -22632,18 +22264,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -22657,18 +22281,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; 
GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -22679,22 +22295,22 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s18 -; GFX940-NEXT: s_mov_b32 s11, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -22713,18 +22329,10 @@ define void @s_shuffle_v2i64_v8i64__13_7() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: 
v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -22738,18 +22346,10 @@ define void @s_shuffle_v2i64_v8i64__13_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -22770,18 +22370,10 @@ define void @s_shuffle_v2i64_v8i64__13_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; 
GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -22795,18 +22387,10 @@ define void @s_shuffle_v2i64_v8i64__13_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -22817,35 +22401,35 @@ define void @s_shuffle_v2i64_v8i64__13_7() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; 
GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s12, s26 -; GFX940-NEXT: s_mov_b32 s13, s27 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s26 +; GFX942-NEXT: s_mov_b32 s13, s27 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -22864,18 +22448,10 @@ define void 
@s_shuffle_v2i64_v8i64__14_7() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -22889,18 +22465,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -22921,18 +22489,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 
; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -22946,18 +22506,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -22968,35 +22520,35 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
s_shuffle_v2i64_v8i64__14_7: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s30, s14 -; GFX940-NEXT: s_mov_b32 s31, s15 -; GFX940-NEXT: s_mov_b64 s[8:9], s[28:29] -; GFX940-NEXT: s_mov_b64 s[10:11], s[30:31] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_7: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s30, s14 +; GFX942-NEXT: s_mov_b32 s31, s15 +; GFX942-NEXT: s_mov_b64 s[8:9], s[28:29] +; GFX942-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; 
GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -23041,17 +22593,17 @@ define void @s_shuffle_v2i64_v8i64__0_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -23085,18 +22637,18 @@ define void @s_shuffle_v2i64_v8i64__1_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_8: +; GFX942: ; %bb.0: +; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -23126,17 +22678,17 @@ define void @s_shuffle_v2i64_v8i64__2_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -23170,18 +22722,18 @@ define void @s_shuffle_v2i64_v8i64__3_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: 
;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -23215,17 +22767,17 @@ define void @s_shuffle_v2i64_v8i64__4_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -23259,18 +22811,18 @@ define void @s_shuffle_v2i64_v8i64__5_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, 
s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -23304,18 +22856,18 @@ define void @s_shuffle_v2i64_v8i64__6_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -23349,18 +22901,18 @@ define void @s_shuffle_v2i64_v8i64__7_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -23412,20 +22964,20 @@ define void @s_shuffle_v2i64_v8i64__9_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -23484,20 +23036,20 @@ define void @s_shuffle_v2i64_v8i64__11_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -23536,18 +23088,18 @@ define void @s_shuffle_v2i64_v8i64__12_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_8: +; GFX942: ; 
%bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -23586,20 +23138,20 @@ define void @s_shuffle_v2i64_v8i64__13_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: s_mov_b32 s10, s0 -; GFX940-NEXT: s_mov_b32 s11, s1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s11, s1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -23638,20 +23190,20 @@ define void @s_shuffle_v2i64_v8i64__14_8() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_8: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; 
GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s0 -; GFX940-NEXT: s_mov_b32 s15, s1 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_8: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: s_mov_b32 s15, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -23682,17 +23234,17 @@ define void @s_shuffle_v2i64_v8i64__u_9() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -23733,22 +23285,22 @@ define void 
@s_shuffle_v2i64_v8i64__0_9() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s14 -; GFX940-NEXT: s_mov_b32 s11, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -23789,22 +23341,22 @@ define void @s_shuffle_v2i64_v8i64__1_9() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -23845,22 +23397,22 @@ define void @s_shuffle_v2i64_v8i64__2_9() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s14 -; GFX940-NEXT: s_mov_b32 s11, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -23879,18 +23431,10 @@ define void @s_shuffle_v2i64_v8i64__3_9() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: 
v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -23902,18 +23446,10 @@ define void @s_shuffle_v2i64_v8i64__3_9() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -23934,18 +23470,10 @@ define void @s_shuffle_v2i64_v8i64__3_9() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; 
GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -23957,18 +23485,10 @@ define void @s_shuffle_v2i64_v8i64__3_9() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -23979,22 +23499,22 @@ define void @s_shuffle_v2i64_v8i64__3_9() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24061,22 +23581,22 @@ define void @s_shuffle_v2i64_v8i64__4_9() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s14 -; GFX940-NEXT: s_mov_b32 s11, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24139,34 +23659,34 @@ define void @s_shuffle_v2i64_v8i64__5_9() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s8, s26 -; GFX940-NEXT: s_mov_b32 s9, s27 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; 
GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s8, s26 +; GFX942-NEXT: s_mov_b32 s9, s27 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24185,18 +23705,10 @@ define void @s_shuffle_v2i64_v8i64__6_9() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -24210,18 +23722,10 @@ define void @s_shuffle_v2i64_v8i64__6_9() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, 
v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -24242,18 +23746,10 @@ define void @s_shuffle_v2i64_v8i64__6_9() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -24267,18 +23763,10 @@ define void @s_shuffle_v2i64_v8i64__6_9() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: 
v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -24289,35 +23777,35 @@ define void @s_shuffle_v2i64_v8i64__6_9() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s18 -; GFX940-NEXT: s_mov_b32 s15, s19 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s18 +; GFX942-NEXT: s_mov_b32 s15, s19 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24358,22 +23846,22 @@ define void @s_shuffle_v2i64_v8i64__7_9() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s26 -; GFX940-NEXT: s_mov_b32 s9, s27 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s26 +; GFX942-NEXT: s_mov_b32 s9, s27 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24404,17 +23892,17 @@ define void @s_shuffle_v2i64_v8i64__8_9() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24513,18 +24001,18 @@ define void @s_shuffle_v2i64_v8i64__12_9() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s2 -; GFX940-NEXT: s_mov_b32 s11, s3 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
s_shuffle_v2i64_v8i64__12_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s2 +; GFX942-NEXT: s_mov_b32 s11, s3 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24583,20 +24071,20 @@ define void @s_shuffle_v2i64_v8i64__14_9() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_9: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s2 -; GFX940-NEXT: s_mov_b32 s15, s3 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_9: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s2 +; GFX942-NEXT: s_mov_b32 s15, s3 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24631,18 +24119,18 @@ define void @s_shuffle_v2i64_v8i64__u_10() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_10: -; GFX940: ; %bb.0: -; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24683,22 +24171,22 @@ define void @s_shuffle_v2i64_v8i64__0_10() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s16 -; GFX940-NEXT: s_mov_b32 s11, s17 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24743,24 +24231,24 @@ define void @s_shuffle_v2i64_v8i64__1_10() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24801,22 +24289,22 @@ define void @s_shuffle_v2i64_v8i64__2_10() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; 
GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s16 -; GFX940-NEXT: s_mov_b32 s11, s17 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24861,24 +24349,24 @@ define void @s_shuffle_v2i64_v8i64__3_10() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; 
GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -24945,22 +24433,22 @@ define void @s_shuffle_v2i64_v8i64__4_10() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s16 -; GFX940-NEXT: s_mov_b32 s11, s17 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25027,23 +24515,23 @@ define void @s_shuffle_v2i64_v8i64__5_10() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; 
GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: s_mov_b32 s10, s16 -; GFX940-NEXT: s_mov_b32 s11, s17 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25062,18 +24550,10 @@ define void @s_shuffle_v2i64_v8i64__6_10() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; 
GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -25087,18 +24567,10 @@ define void @s_shuffle_v2i64_v8i64__6_10() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -25119,18 +24591,10 @@ define void @s_shuffle_v2i64_v8i64__6_10() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: 
;;#ASMEND @@ -25144,18 +24608,10 @@ define void @s_shuffle_v2i64_v8i64__6_10() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -25166,35 +24622,35 @@ define void @s_shuffle_v2i64_v8i64__6_10() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s20 -; GFX940-NEXT: s_mov_b32 s15, s21 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; 
GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s20 +; GFX942-NEXT: s_mov_b32 s15, s21 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25213,18 +24669,10 @@ define void @s_shuffle_v2i64_v8i64__7_10() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; 
GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -25238,18 +24686,10 @@ define void @s_shuffle_v2i64_v8i64__7_10() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -25270,18 +24710,10 @@ define void @s_shuffle_v2i64_v8i64__7_10() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 
v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -25295,18 +24727,10 @@ define void @s_shuffle_v2i64_v8i64__7_10() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -25317,35 +24741,35 @@ define void @s_shuffle_v2i64_v8i64__7_10() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: 
;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: s_mov_b32 s10, s20 -; GFX940-NEXT: s_mov_b32 s11, s21 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s20 +; GFX942-NEXT: s_mov_b32 s11, s21 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25404,20 +24828,20 @@ define void @s_shuffle_v2i64_v8i64__9_10() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: s_shuffle_v2i64_v8i64__9_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25476,20 +24900,20 @@ define void @s_shuffle_v2i64_v8i64__11_10() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: 
s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25528,18 +24952,18 @@ define void @s_shuffle_v2i64_v8i64__12_10() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25578,20 +25002,20 @@ define void @s_shuffle_v2i64_v8i64__13_10() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: s_mov_b32 s10, s4 -; GFX940-NEXT: s_mov_b32 s11, s5 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
s_shuffle_v2i64_v8i64__13_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s4 +; GFX942-NEXT: s_mov_b32 s11, s5 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25630,20 +25054,20 @@ define void @s_shuffle_v2i64_v8i64__14_10() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_10: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s4 -; GFX940-NEXT: s_mov_b32 s15, s5 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_10: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s4 +; GFX942-NEXT: s_mov_b32 s15, s5 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25674,17 +25098,17 @@ define void @s_shuffle_v2i64_v8i64__u_11() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; 
-; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25725,22 +25149,22 @@ define void @s_shuffle_v2i64_v8i64__0_11() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s18 -; GFX940-NEXT: s_mov_b32 s11, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25759,18 +25183,10 @@ define void @s_shuffle_v2i64_v8i64__1_11() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -25782,18 +25198,10 @@ define void @s_shuffle_v2i64_v8i64__1_11() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 
s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -25814,18 +25222,10 @@ define void @s_shuffle_v2i64_v8i64__1_11() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -25837,18 +25237,10 @@ define void @s_shuffle_v2i64_v8i64__1_11() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: 
v_readlane_b32 s37, v0, 1 @@ -25859,22 +25251,22 @@ define void @s_shuffle_v2i64_v8i64__1_11() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25915,22 +25307,22 @@ define void @s_shuffle_v2i64_v8i64__2_11() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s18 -; GFX940-NEXT: s_mov_b32 s11, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: 
s_shuffle_v2i64_v8i64__2_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -25993,34 +25385,34 @@ define void @s_shuffle_v2i64_v8i64__3_11() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s8, s22 -; GFX940-NEXT: s_mov_b32 s9, s23 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 
s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s8, s22 +; GFX942-NEXT: s_mov_b32 s9, s23 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26087,22 +25479,22 @@ define void @s_shuffle_v2i64_v8i64__4_11() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s18 -; GFX940-NEXT: s_mov_b32 s11, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: 
;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26143,22 +25535,22 @@ define void @s_shuffle_v2i64_v8i64__5_11() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s22 -; GFX940-NEXT: s_mov_b32 s9, s23 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s22 +; GFX942-NEXT: s_mov_b32 s9, s23 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26177,18 +25569,10 @@ define void @s_shuffle_v2i64_v8i64__6_11() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; 
GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -26202,18 +25586,10 @@ define void @s_shuffle_v2i64_v8i64__6_11() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -26234,18 +25610,10 @@ define void @s_shuffle_v2i64_v8i64__6_11() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 
v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -26259,18 +25627,10 @@ define void @s_shuffle_v2i64_v8i64__6_11() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -26281,35 +25641,35 @@ define void @s_shuffle_v2i64_v8i64__6_11() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; 
GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s22 -; GFX940-NEXT: s_mov_b32 s15, s23 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s22 +; GFX942-NEXT: s_mov_b32 s15, s23 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = 
shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26350,22 +25710,22 @@ define void @s_shuffle_v2i64_v8i64__7_11() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s22 -; GFX940-NEXT: s_mov_b32 s9, s23 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s22 +; GFX942-NEXT: s_mov_b32 s9, s23 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26436,17 +25796,17 @@ define void @s_shuffle_v2i64_v8i64__10_11() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; 
GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26505,18 +25865,18 @@ define void @s_shuffle_v2i64_v8i64__12_11() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s6 -; GFX940-NEXT: s_mov_b32 s11, s7 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26575,20 +25935,20 @@ define void @s_shuffle_v2i64_v8i64__14_11() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_11: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s6 -; GFX940-NEXT: s_mov_b32 s15, s7 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; 
GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_11: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s6 +; GFX942-NEXT: s_mov_b32 s15, s7 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26623,18 +25983,18 @@ define void @s_shuffle_v2i64_v8i64__u_12() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s8 -; GFX940-NEXT: s_mov_b32 s11, s9 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26675,22 +26035,22 @@ define void @s_shuffle_v2i64_v8i64__0_12() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s20 -; GFX940-NEXT: s_mov_b32 s11, s21 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s20 +; GFX942-NEXT: s_mov_b32 s11, s21 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26735,24 +26095,24 @@ define void @s_shuffle_v2i64_v8i64__1_12() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND 
+; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26793,22 +26153,22 @@ define void @s_shuffle_v2i64_v8i64__2_12() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s20 -; GFX940-NEXT: s_mov_b32 s11, s21 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s20 +; GFX942-NEXT: s_mov_b32 s11, s21 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26853,24 +26213,24 @@ define void @s_shuffle_v2i64_v8i64__3_12() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
s_shuffle_v2i64_v8i64__3_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: s_mov_b32 s10, s16 -; GFX940-NEXT: s_mov_b32 s11, s17 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -26937,22 +26297,22 @@ define void @s_shuffle_v2i64_v8i64__4_12() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s20 -; GFX940-NEXT: s_mov_b32 s11, s21 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_12: +; GFX942: ; %bb.0: 
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s20 +; GFX942-NEXT: s_mov_b32 s11, s21 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -27019,23 +26379,23 @@ define void @s_shuffle_v2i64_v8i64__5_12() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: s_mov_b32 s10, s20 -; GFX940-NEXT: s_mov_b32 s11, s21 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s20 +; GFX942-NEXT: s_mov_b32 s11, s21 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> 
@@ -27054,18 +26414,10 @@ define void @s_shuffle_v2i64_v8i64__6_12() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -27079,18 +26431,10 @@ define void @s_shuffle_v2i64_v8i64__6_12() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -27111,18 +26455,10 @@ define void @s_shuffle_v2i64_v8i64__6_12() { ; 
GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -27136,18 +26472,10 @@ define void @s_shuffle_v2i64_v8i64__6_12() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -27158,35 +26486,35 @@ define void @s_shuffle_v2i64_v8i64__6_12() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: s_shuffle_v2i64_v8i64__6_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s24 -; GFX940-NEXT: s_mov_b32 s15, s25 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s24 +; GFX942-NEXT: s_mov_b32 s15, s25 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 
+; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -27205,18 +26533,10 @@ define void @s_shuffle_v2i64_v8i64__7_12() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -27230,18 +26550,10 @@ define void @s_shuffle_v2i64_v8i64__7_12() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: 
v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -27262,18 +26574,10 @@ define void @s_shuffle_v2i64_v8i64__7_12() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -27287,18 +26591,10 @@ define void @s_shuffle_v2i64_v8i64__7_12() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; 
GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -27309,35 +26605,35 @@ define void @s_shuffle_v2i64_v8i64__7_12() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: s_mov_b32 s10, s24 -; GFX940-NEXT: s_mov_b32 s11, s25 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; 
GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s24 +; GFX942-NEXT: s_mov_b32 s11, s25 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -27460,18 +26756,18 @@ define void @s_shuffle_v2i64_v8i64__12_12() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s8 -; GFX940-NEXT: s_mov_b32 s11, s9 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s8 +; GFX942-NEXT: s_mov_b32 s11, s9 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -27532,20 +26828,20 @@ define void @s_shuffle_v2i64_v8i64__14_12() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
s_shuffle_v2i64_v8i64__14_12: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s8 -; GFX940-NEXT: s_mov_b32 s15, s9 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_12: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s8 +; GFX942-NEXT: s_mov_b32 s15, s9 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -27580,17 +26876,17 @@ define void @s_shuffle_v2i64_v8i64__u_13() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = 
call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -27631,22 +26927,22 @@ define void @s_shuffle_v2i64_v8i64__0_13() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s22 -; GFX940-NEXT: s_mov_b32 s11, s23 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s22 +; GFX942-NEXT: s_mov_b32 s11, s23 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -27691,33 +26987,33 @@ define void @s_shuffle_v2i64_v8i64__1_13() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; 
GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s18 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s9, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -27758,22 +27054,22 @@ define void @s_shuffle_v2i64_v8i64__2_13() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s22 -; GFX940-NEXT: s_mov_b32 s11, s23 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s22 +; GFX942-NEXT: s_mov_b32 s11, s23 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -27818,22 +27114,22 @@ define void @s_shuffle_v2i64_v8i64__3_13() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s18 -; GFX940-NEXT: s_mov_b32 s9, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; 
GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -27900,22 +27196,22 @@ define void @s_shuffle_v2i64_v8i64__4_13() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s22 -; GFX940-NEXT: s_mov_b32 s11, s23 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s22 +; GFX942-NEXT: s_mov_b32 s11, s23 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -27982,22 +27278,22 @@ define void @s_shuffle_v2i64_v8i64__5_13() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def 
s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s18 -; GFX940-NEXT: s_mov_b32 s9, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28016,18 +27312,10 @@ define void @s_shuffle_v2i64_v8i64__6_13() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -28041,18 +27329,10 @@ define 
void @s_shuffle_v2i64_v8i64__6_13() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -28073,18 +27353,10 @@ define void @s_shuffle_v2i64_v8i64__6_13() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -28098,18 +27370,10 @@ define void @s_shuffle_v2i64_v8i64__6_13() { ; GFX90A-NEXT: ;;#ASMSTART ; 
GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -28120,35 +27384,35 @@ define void @s_shuffle_v2i64_v8i64__6_13() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s26 -; GFX940-NEXT: s_mov_b32 s15, s27 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded 
Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s26 +; GFX942-NEXT: s_mov_b32 s15, s27 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28167,18 +27431,10 @@ define void @s_shuffle_v2i64_v8i64__7_13() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: 
v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -28192,18 +27448,10 @@ define void @s_shuffle_v2i64_v8i64__7_13() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -28224,18 +27472,10 @@ define void @s_shuffle_v2i64_v8i64__7_13() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 
-; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -28249,18 +27489,10 @@ define void @s_shuffle_v2i64_v8i64__7_13() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -28271,22 +27503,22 @@ define void @s_shuffle_v2i64_v8i64__7_13() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s18 -; GFX940-NEXT: s_mov_b32 s9, s19 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28345,18 +27577,18 @@ define void @s_shuffle_v2i64_v8i64__9_13() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28415,18 +27647,18 @@ define void @s_shuffle_v2i64_v8i64__11_13() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: 
s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28461,17 +27693,17 @@ define void @s_shuffle_v2i64_v8i64__12_13() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28510,18 +27742,18 @@ define void @s_shuffle_v2i64_v8i64__13_13() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: 
;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28560,20 +27792,20 @@ define void @s_shuffle_v2i64_v8i64__14_13() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_13: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s10 -; GFX940-NEXT: s_mov_b32 s15, s11 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_13: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s10 +; GFX942-NEXT: s_mov_b32 s15, s11 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def 
$0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28608,18 +27840,18 @@ define void @s_shuffle_v2i64_v8i64__u_14() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28660,22 +27892,22 @@ define void @s_shuffle_v2i64_v8i64__0_14() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s24 -; GFX940-NEXT: s_mov_b32 s11, s25 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: 
;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s24 +; GFX942-NEXT: s_mov_b32 s11, s25 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28720,24 +27952,24 @@ define void @s_shuffle_v2i64_v8i64__1_14() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: s_mov_b32 s10, s16 -; GFX940-NEXT: s_mov_b32 s11, s17 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28778,22 +28010,22 @@ define void @s_shuffle_v2i64_v8i64__2_14() { ; GFX90A-NEXT: ;;#ASMEND ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s24 -; GFX940-NEXT: s_mov_b32 s11, s25 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s24 +; GFX942-NEXT: s_mov_b32 s11, s25 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28838,24 +28070,24 @@ define void @s_shuffle_v2i64_v8i64__3_14() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: s_mov_b32 s10, s20 -; GFX940-NEXT: s_mov_b32 s11, s21 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s20 +; GFX942-NEXT: s_mov_b32 s11, s21 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -28922,22 +28154,22 @@ define void @s_shuffle_v2i64_v8i64__4_14() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s24 -; GFX940-NEXT: s_mov_b32 s11, s25 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s24 +; GFX942-NEXT: s_mov_b32 s11, s25 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29004,23 +28236,23 @@ define void 
@s_shuffle_v2i64_v8i64__5_14() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s10 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: s_mov_b32 s10, s24 -; GFX940-NEXT: s_mov_b32 s11, s25 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s24 +; GFX942-NEXT: s_mov_b32 s11, s25 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29039,18 +28271,10 @@ define void @s_shuffle_v2i64_v8i64__6_14() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; 
GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -29064,18 +28288,10 @@ define void @s_shuffle_v2i64_v8i64__6_14() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -29096,18 +28312,10 @@ define void @s_shuffle_v2i64_v8i64__6_14() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 
v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -29121,18 +28329,10 @@ define void @s_shuffle_v2i64_v8i64__6_14() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -29143,35 +28343,35 @@ define void @s_shuffle_v2i64_v8i64__6_14() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s28 -; GFX940-NEXT: s_mov_b32 s15, s29 -; 
GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s28 +; GFX942-NEXT: s_mov_b32 s15, s29 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29190,18 +28390,10 @@ define void @s_shuffle_v2i64_v8i64__7_14() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; 
GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -29215,18 +28407,10 @@ define void @s_shuffle_v2i64_v8i64__7_14() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -29247,18 +28431,10 @@ define void @s_shuffle_v2i64_v8i64__7_14() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 
v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -29272,18 +28448,10 @@ define void @s_shuffle_v2i64_v8i64__7_14() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -29294,35 +28462,35 @@ define void @s_shuffle_v2i64_v8i64__7_14() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill 
-; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s14 -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b32 s9, s15 -; GFX940-NEXT: s_mov_b32 s10, s28 -; GFX940-NEXT: s_mov_b32 s11, s29 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s10, s28 +; GFX942-NEXT: s_mov_b32 s11, s29 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = 
call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29381,20 +28549,20 @@ define void @s_shuffle_v2i64_v8i64__9_14() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s2 -; GFX940-NEXT: s_mov_b32 s9, s3 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29453,20 +28621,20 @@ define void @s_shuffle_v2i64_v8i64__11_14() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s8, s6 -; GFX940-NEXT: s_mov_b32 s9, s7 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_14: +; GFX942: ; %bb.0: +; 
GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s6 +; GFX942-NEXT: s_mov_b32 s9, s7 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29505,18 +28673,18 @@ define void @s_shuffle_v2i64_v8i64__12_14() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29555,20 +28723,20 @@ define void @s_shuffle_v2i64_v8i64__13_14() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: 
s_mov_b32 s8, s10 -; GFX940-NEXT: s_mov_b32 s9, s11 -; GFX940-NEXT: s_mov_b32 s10, s12 -; GFX940-NEXT: s_mov_b32 s11, s13 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s10 +; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29607,20 +28775,20 @@ define void @s_shuffle_v2i64_v8i64__14_14() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_14: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s12 -; GFX940-NEXT: s_mov_b32 s15, s13 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_14: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s12 +; GFX942-NEXT: s_mov_b32 s15, s13 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> 
asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29655,18 +28823,18 @@ define void @s_shuffle_v2i64_v8i64__u_15() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29707,22 +28875,22 @@ define void @s_shuffle_v2i64_v8i64__0_15() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s26 -; GFX940-NEXT: s_mov_b32 s11, s27 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s26 +; GFX942-NEXT: s_mov_b32 s11, s27 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29767,24 +28935,24 @@ define void @s_shuffle_v2i64_v8i64__1_15() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s16, s2 -; GFX940-NEXT: s_mov_b32 s17, s3 -; GFX940-NEXT: s_mov_b64 s[8:9], s[16:17] -; GFX940-NEXT: s_mov_b64 s[10:11], s[18:19] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s16, s2 +; GFX942-NEXT: s_mov_b32 s17, s3 +; GFX942-NEXT: s_mov_b64 s[8:9], s[16:17] +; GFX942-NEXT: s_mov_b64 s[10:11], s[18:19] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x 
i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29825,22 +28993,22 @@ define void @s_shuffle_v2i64_v8i64__2_15() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[4:19] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s26 -; GFX940-NEXT: s_mov_b32 s11, s27 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[4:19] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s26 +; GFX942-NEXT: s_mov_b32 s11, s27 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29885,24 +29053,24 @@ define void @s_shuffle_v2i64_v8i64__3_15() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[8:23] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s20, s6 -; GFX940-NEXT: s_mov_b32 s21, s7 -; GFX940-NEXT: s_mov_b64 s[8:9], s[20:21] -; GFX940-NEXT: s_mov_b64 s[10:11], s[22:23] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use 
s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[8:23] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s20, s6 +; GFX942-NEXT: s_mov_b32 s21, s7 +; GFX942-NEXT: s_mov_b64 s[8:9], s[20:21] +; GFX942-NEXT: s_mov_b64 s[10:11], s[22:23] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -29969,22 +29137,22 @@ define void @s_shuffle_v2i64_v8i64__4_15() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s26 -; GFX940-NEXT: s_mov_b32 s11, s27 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s26 +; GFX942-NEXT: s_mov_b32 s11, s27 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> 
asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -30051,24 +29219,24 @@ define void @s_shuffle_v2i64_v8i64__5_15() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[12:27] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s24, s10 -; GFX940-NEXT: s_mov_b32 s25, s11 -; GFX940-NEXT: s_mov_b64 s[8:9], s[24:25] -; GFX940-NEXT: s_mov_b64 s[10:11], s[26:27] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[12:27] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s24, s10 +; GFX942-NEXT: s_mov_b32 s25, s11 +; GFX942-NEXT: s_mov_b64 s[8:9], s[24:25] +; GFX942-NEXT: s_mov_b64 s[10:11], s[26:27] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -30087,18 +29255,10 @@ define void @s_shuffle_v2i64_v8i64__6_15() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; 
GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND @@ -30112,18 +29272,10 @@ define void @s_shuffle_v2i64_v8i64__6_15() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -30144,18 +29296,10 @@ define void @s_shuffle_v2i64_v8i64__6_15() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 
v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND @@ -30169,18 +29313,10 @@ define void @s_shuffle_v2i64_v8i64__6_15() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -30191,35 +29327,35 @@ define void @s_shuffle_v2i64_v8i64__6_15() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s14, s30 -; GFX940-NEXT: s_mov_b32 s15, s31 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__6_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s14, s30 +; GFX942-NEXT: s_mov_b32 s15, s31 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x 
i64> %vec1, <2 x i32> @@ -30238,18 +29374,10 @@ define void @s_shuffle_v2i64_v8i64__7_15() { ; GFX900-NEXT: v_writelane_b32 v0, s37, 1 ; GFX900-NEXT: v_writelane_b32 v0, s38, 2 ; GFX900-NEXT: v_writelane_b32 v0, s39, 3 -; GFX900-NEXT: v_writelane_b32 v0, s40, 4 -; GFX900-NEXT: v_writelane_b32 v0, s41, 5 -; GFX900-NEXT: v_writelane_b32 v0, s42, 6 -; GFX900-NEXT: v_writelane_b32 v0, s43, 7 -; GFX900-NEXT: v_writelane_b32 v0, s44, 8 -; GFX900-NEXT: v_writelane_b32 v0, s45, 9 -; GFX900-NEXT: v_writelane_b32 v0, s46, 10 -; GFX900-NEXT: v_writelane_b32 v0, s47, 11 -; GFX900-NEXT: v_writelane_b32 v0, s48, 12 -; GFX900-NEXT: v_writelane_b32 v0, s49, 13 -; GFX900-NEXT: v_writelane_b32 v0, s50, 14 -; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: v_writelane_b32 v0, s48, 4 +; GFX900-NEXT: v_writelane_b32 v0, s49, 5 +; GFX900-NEXT: v_writelane_b32 v0, s50, 6 +; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND @@ -30263,18 +29391,10 @@ define void @s_shuffle_v2i64_v8i64__7_15() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s51, v0, 15 -; GFX900-NEXT: v_readlane_b32 s50, v0, 14 -; GFX900-NEXT: v_readlane_b32 s49, v0, 13 -; GFX900-NEXT: v_readlane_b32 s48, v0, 12 -; GFX900-NEXT: v_readlane_b32 s47, v0, 11 -; GFX900-NEXT: v_readlane_b32 s46, v0, 10 -; GFX900-NEXT: v_readlane_b32 s45, v0, 9 -; GFX900-NEXT: v_readlane_b32 s44, v0, 8 -; GFX900-NEXT: v_readlane_b32 s43, v0, 7 -; GFX900-NEXT: v_readlane_b32 s42, v0, 6 -; GFX900-NEXT: v_readlane_b32 s41, v0, 5 -; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s51, v0, 7 +; GFX900-NEXT: v_readlane_b32 s50, v0, 6 +; GFX900-NEXT: v_readlane_b32 s49, v0, 5 +; GFX900-NEXT: v_readlane_b32 s48, v0, 4 ; GFX900-NEXT: v_readlane_b32 s39, v0, 3 ; GFX900-NEXT: v_readlane_b32 s38, v0, 2 ; GFX900-NEXT: v_readlane_b32 s37, v0, 1 @@ -30295,18 +29415,10 @@ define void 
@s_shuffle_v2i64_v8i64__7_15() { ; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 ; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 ; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 -; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 -; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 -; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 -; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 -; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 -; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 -; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 -; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 -; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 -; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 -; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 -; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND @@ -30320,18 +29432,10 @@ define void @s_shuffle_v2i64_v8i64__7_15() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 -; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 -; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 -; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 -; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 -; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 -; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 -; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 -; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 -; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 -; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 -; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s51, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 4 ; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 ; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 ; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 @@ -30342,35 +29446,35 @@ define void @s_shuffle_v2i64_v8i64__7_15() { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v0, s30, 0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_writelane_b32 v0, s31, 1 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[16:31] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s28, s14 -; GFX940-NEXT: s_mov_b32 s29, s15 -; GFX940-NEXT: s_mov_b64 s[8:9], s[28:29] -; GFX940-NEXT: s_mov_b64 s[10:11], s[30:31] -; GFX940-NEXT: v_readlane_b32 s30, v0, 0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s31, v0, 1 -; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s28, s14 +; GFX942-NEXT: s_mov_b32 s29, s15 +; GFX942-NEXT: s_mov_b64 s[8:9], s[28:29] +; GFX942-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; 
GFX942-NEXT: v_readlane_b32 s31, v0, 1 +; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX942-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -30429,20 +29533,20 @@ define void @s_shuffle_v2i64_v8i64__9_15() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s12, s2 -; GFX940-NEXT: s_mov_b32 s13, s3 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: s_mov_b32 s13, s3 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -30501,20 +29605,20 @@ define void @s_shuffle_v2i64_v8i64__11_15() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] 
-; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s12, s6 -; GFX940-NEXT: s_mov_b32 s13, s7 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s6 +; GFX942-NEXT: s_mov_b32 s13, s7 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -30553,18 +29657,18 @@ define void @s_shuffle_v2i64_v8i64__12_15() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s10, s14 -; GFX940-NEXT: s_mov_b32 s11, s15 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__12_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s14 +; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> 
%vec0, <8 x i64> %vec1, <2 x i32> @@ -30603,20 +29707,20 @@ define void @s_shuffle_v2i64_v8i64__13_15() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b32 s12, s10 -; GFX940-NEXT: s_mov_b32 s13, s11 -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s12, s10 +; GFX942-NEXT: s_mov_b32 s13, s11 +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -30651,18 +29755,18 @@ define void @s_shuffle_v2i64_v8i64__14_15() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_15: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def s[0:15] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_mov_b64 s[8:9], s[12:13] -; GFX940-NEXT: s_mov_b64 s[10:11], s[14:15] -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[8:11] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: s_shuffle_v2i64_v8i64__14_15: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: 
;;#ASMSTART +; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:11] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll index 15f6bb632f311..20e84335d6b20 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll @@ -12,17 +12,16 @@ define amdgpu_ps i32 @if_else(i32 %0) !dbg !5 { ; OPT-NEXT: [[TMP4:%.*]] = extractvalue { i1, i64 } [[TMP2]], 1, !dbg [[DBG14]] ; OPT-NEXT: br i1 [[TMP3]], label [[FALSE:%.*]], label [[FLOW:%.*]], !dbg [[DBG14]] ; OPT: Flow: -; OPT-NEXT: [[TMP5:%.*]] = phi i32 [ 33, [[FALSE]] ], [ undef, [[TMP1:%.*]] ] -; OPT-NEXT: [[TMP6:%.*]] = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 [[TMP4]]), !dbg [[DBG14]] -; OPT-NEXT: [[TMP7:%.*]] = extractvalue { i1, i64 } [[TMP6]], 0, !dbg [[DBG14]] -; OPT-NEXT: [[TMP8:%.*]] = extractvalue { i1, i64 } [[TMP6]], 1, !dbg [[DBG14]] -; OPT-NEXT: br i1 [[TMP7]], label [[TRUE:%.*]], label [[EXIT:%.*]], !dbg [[DBG14]] +; OPT-NEXT: [[TMP5:%.*]] = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 [[TMP4]]) +; OPT-NEXT: [[TMP6:%.*]] = extractvalue { i1, i64 } [[TMP5]], 0 +; OPT-NEXT: [[TMP8:%.*]] = extractvalue { i1, i64 } [[TMP5]], 1 +; OPT-NEXT: br i1 [[TMP6]], label [[TRUE:%.*]], label [[EXIT:%.*]] ; OPT: true: ; OPT-NEXT: br label [[EXIT]], !dbg [[DBG15:![0-9]+]] ; OPT: false: ; OPT-NEXT: br label [[FLOW]], !dbg [[DBG16:![0-9]+]] ; OPT: exit: -; OPT-NEXT: [[RET:%.*]] = phi i32 [ [[TMP5]], [[FLOW]] ], [ 42, [[TRUE]] ], !dbg [[DBG17:![0-9]+]] +; OPT-NEXT: [[RET:%.*]] = phi i32 [ 33, [[FLOW]] ], [ 42, [[TRUE]] ], !dbg [[DBG17:![0-9]+]] ; 
OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) ; OPT-NEXT: #dbg_value(i32 [[RET]], [[META11:![0-9]+]], !DIExpression(), [[DBG17]]) ; OPT-NEXT: ret i32 [[RET]], !dbg [[DBG18:![0-9]+]] @@ -63,12 +62,12 @@ define amdgpu_ps void @loop_if_break(i32 %n) !dbg !19 { ; OPT-NEXT: #dbg_value(i32 [[I_NEXT]], [[META23:![0-9]+]], !DIExpression(), [[DBG28]]) ; OPT-NEXT: br label [[FLOW]], !dbg [[DBG29:![0-9]+]] ; OPT: Flow: -; OPT-NEXT: [[TMP3]] = phi i32 [ [[I_NEXT]], [[LOOP_BODY]] ], [ undef, [[LOOP]] ] +; OPT-NEXT: [[TMP3]] = phi i32 [ [[I_NEXT]], [[LOOP_BODY]] ], [ poison, [[LOOP]] ] ; OPT-NEXT: [[TMP4:%.*]] = phi i1 [ false, [[LOOP_BODY]] ], [ true, [[LOOP]] ] ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) -; OPT-NEXT: [[TMP5]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN]]), !dbg [[DBG27]] -; OPT-NEXT: [[TMP6:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP5]]), !dbg [[DBG27]] -; OPT-NEXT: br i1 [[TMP6]], label [[EXIT:%.*]], label [[LOOP]], !dbg [[DBG27]] +; OPT-NEXT: [[TMP5]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN]]) +; OPT-NEXT: [[TMP6:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP5]]) +; OPT-NEXT: br i1 [[TMP6]], label [[EXIT:%.*]], label [[LOOP]] ; OPT: exit: ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP5]]) ; OPT-NEXT: ret void, !dbg [[DBG30:![0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir index b427974edeb66..400005a8dff2e 100644 --- a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir @@ -26,7 +26,8 @@ body: | ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_2]], [[V_XOR_B32_e64_]], implicit $exec - ; CHECK-NEXT: $sgpr0 = V_READFIRSTLANE_B32 [[V_AND_B32_e64_]], 
implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_AND_B32_e64_]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0 %0:sgpr_32 = COPY $sgpr0 %2:sreg_32 = S_MOV_B32 16 diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 022a8a5732ec6..9538e13108427 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -615,40 +615,40 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; FIJI-NEXT: v_writelane_b32 v40, s37, 5 ; FIJI-NEXT: v_writelane_b32 v40, s38, 6 ; FIJI-NEXT: v_writelane_b32 v40, s39, 7 -; FIJI-NEXT: v_writelane_b32 v40, s40, 8 -; FIJI-NEXT: v_writelane_b32 v40, s41, 9 -; FIJI-NEXT: v_writelane_b32 v40, s42, 10 -; FIJI-NEXT: v_writelane_b32 v40, s43, 11 -; FIJI-NEXT: v_writelane_b32 v40, s44, 12 -; FIJI-NEXT: v_writelane_b32 v40, s45, 13 -; FIJI-NEXT: v_writelane_b32 v40, s46, 14 -; FIJI-NEXT: v_writelane_b32 v40, s47, 15 -; FIJI-NEXT: v_writelane_b32 v40, s48, 16 -; FIJI-NEXT: s_mov_b32 s42, s15 -; FIJI-NEXT: s_mov_b32 s43, s14 -; FIJI-NEXT: s_mov_b32 s44, s13 -; FIJI-NEXT: s_mov_b32 s45, s12 +; FIJI-NEXT: v_writelane_b32 v40, s48, 8 +; FIJI-NEXT: v_writelane_b32 v40, s49, 9 +; FIJI-NEXT: v_writelane_b32 v40, s50, 10 +; FIJI-NEXT: v_writelane_b32 v40, s51, 11 +; FIJI-NEXT: v_writelane_b32 v40, s52, 12 +; FIJI-NEXT: v_writelane_b32 v40, s53, 13 +; FIJI-NEXT: v_writelane_b32 v40, s54, 14 +; FIJI-NEXT: v_writelane_b32 v40, s55, 15 +; FIJI-NEXT: v_writelane_b32 v40, s64, 16 +; FIJI-NEXT: s_mov_b32 s50, s15 +; FIJI-NEXT: s_mov_b32 s51, s14 +; FIJI-NEXT: s_mov_b32 s52, s13 +; FIJI-NEXT: s_mov_b32 s53, s12 ; FIJI-NEXT: s_mov_b64 s[34:35], s[10:11] ; FIJI-NEXT: s_mov_b64 s[36:37], s[8:9] ; FIJI-NEXT: s_mov_b64 s[38:39], s[6:7] -; FIJI-NEXT: s_mov_b64 s[40:41], s[4:5] +; FIJI-NEXT: s_mov_b64 s[48:49], s[4:5] ; FIJI-NEXT: 
v_add_u32_e32 v3, vcc, v3, v4 -; FIJI-NEXT: s_mov_b64 s[46:47], exec +; FIJI-NEXT: s_mov_b64 s[54:55], exec ; FIJI-NEXT: s_addk_i32 s32, 0x400 -; FIJI-NEXT: v_writelane_b32 v40, s49, 17 +; FIJI-NEXT: v_writelane_b32 v40, s65, 17 ; FIJI-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; FIJI-NEXT: v_readfirstlane_b32 s16, v0 ; FIJI-NEXT: v_readfirstlane_b32 s17, v1 ; FIJI-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; FIJI-NEXT: s_and_saveexec_b64 s[48:49], vcc -; FIJI-NEXT: s_mov_b64 s[4:5], s[40:41] +; FIJI-NEXT: s_and_saveexec_b64 s[64:65], vcc +; FIJI-NEXT: s_mov_b64 s[4:5], s[48:49] ; FIJI-NEXT: s_mov_b64 s[6:7], s[38:39] ; FIJI-NEXT: s_mov_b64 s[8:9], s[36:37] ; FIJI-NEXT: s_mov_b64 s[10:11], s[34:35] -; FIJI-NEXT: s_mov_b32 s12, s45 -; FIJI-NEXT: s_mov_b32 s13, s44 -; FIJI-NEXT: s_mov_b32 s14, s43 -; FIJI-NEXT: s_mov_b32 s15, s42 +; FIJI-NEXT: s_mov_b32 s12, s53 +; FIJI-NEXT: s_mov_b32 s13, s52 +; FIJI-NEXT: s_mov_b32 s14, s51 +; FIJI-NEXT: s_mov_b32 s15, s50 ; FIJI-NEXT: v_mov_b32_e32 v0, v2 ; FIJI-NEXT: v_mov_b32_e32 v1, v3 ; FIJI-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -657,21 +657,21 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; FIJI-NEXT: ; implicit-def: $vgpr31 ; FIJI-NEXT: ; implicit-def: $vgpr2 ; FIJI-NEXT: ; implicit-def: $vgpr3 -; FIJI-NEXT: s_xor_b64 exec, exec, s[48:49] +; FIJI-NEXT: s_xor_b64 exec, exec, s[64:65] ; FIJI-NEXT: s_cbranch_execnz .LBB18_1 ; FIJI-NEXT: ; %bb.2: -; FIJI-NEXT: s_mov_b64 exec, s[46:47] +; FIJI-NEXT: s_mov_b64 exec, s[54:55] ; FIJI-NEXT: v_mov_b32_e32 v0, v4 -; FIJI-NEXT: v_readlane_b32 s49, v40, 17 -; FIJI-NEXT: v_readlane_b32 s48, v40, 16 -; FIJI-NEXT: v_readlane_b32 s47, v40, 15 -; FIJI-NEXT: v_readlane_b32 s46, v40, 14 -; FIJI-NEXT: v_readlane_b32 s45, v40, 13 -; FIJI-NEXT: v_readlane_b32 s44, v40, 12 -; FIJI-NEXT: v_readlane_b32 s43, v40, 11 -; FIJI-NEXT: v_readlane_b32 s42, v40, 10 -; FIJI-NEXT: v_readlane_b32 s41, v40, 9 -; FIJI-NEXT: v_readlane_b32 s40, v40, 8 +; 
FIJI-NEXT: v_readlane_b32 s65, v40, 17 +; FIJI-NEXT: v_readlane_b32 s64, v40, 16 +; FIJI-NEXT: v_readlane_b32 s55, v40, 15 +; FIJI-NEXT: v_readlane_b32 s54, v40, 14 +; FIJI-NEXT: v_readlane_b32 s53, v40, 13 +; FIJI-NEXT: v_readlane_b32 s52, v40, 12 +; FIJI-NEXT: v_readlane_b32 s51, v40, 11 +; FIJI-NEXT: v_readlane_b32 s50, v40, 10 +; FIJI-NEXT: v_readlane_b32 s49, v40, 9 +; FIJI-NEXT: v_readlane_b32 s48, v40, 8 ; FIJI-NEXT: v_readlane_b32 s39, v40, 7 ; FIJI-NEXT: v_readlane_b32 s38, v40, 6 ; FIJI-NEXT: v_readlane_b32 s37, v40, 5 @@ -706,40 +706,40 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; HAWAII-NEXT: v_writelane_b32 v40, s37, 5 ; HAWAII-NEXT: v_writelane_b32 v40, s38, 6 ; HAWAII-NEXT: v_writelane_b32 v40, s39, 7 -; HAWAII-NEXT: v_writelane_b32 v40, s40, 8 -; HAWAII-NEXT: v_writelane_b32 v40, s41, 9 -; HAWAII-NEXT: v_writelane_b32 v40, s42, 10 -; HAWAII-NEXT: v_writelane_b32 v40, s43, 11 -; HAWAII-NEXT: v_writelane_b32 v40, s44, 12 -; HAWAII-NEXT: v_writelane_b32 v40, s45, 13 -; HAWAII-NEXT: v_writelane_b32 v40, s46, 14 -; HAWAII-NEXT: v_writelane_b32 v40, s47, 15 -; HAWAII-NEXT: v_writelane_b32 v40, s48, 16 -; HAWAII-NEXT: s_mov_b32 s42, s15 -; HAWAII-NEXT: s_mov_b32 s43, s14 -; HAWAII-NEXT: s_mov_b32 s44, s13 -; HAWAII-NEXT: s_mov_b32 s45, s12 +; HAWAII-NEXT: v_writelane_b32 v40, s48, 8 +; HAWAII-NEXT: v_writelane_b32 v40, s49, 9 +; HAWAII-NEXT: v_writelane_b32 v40, s50, 10 +; HAWAII-NEXT: v_writelane_b32 v40, s51, 11 +; HAWAII-NEXT: v_writelane_b32 v40, s52, 12 +; HAWAII-NEXT: v_writelane_b32 v40, s53, 13 +; HAWAII-NEXT: v_writelane_b32 v40, s54, 14 +; HAWAII-NEXT: v_writelane_b32 v40, s55, 15 +; HAWAII-NEXT: v_writelane_b32 v40, s64, 16 +; HAWAII-NEXT: s_mov_b32 s50, s15 +; HAWAII-NEXT: s_mov_b32 s51, s14 +; HAWAII-NEXT: s_mov_b32 s52, s13 +; HAWAII-NEXT: s_mov_b32 s53, s12 ; HAWAII-NEXT: s_mov_b64 s[34:35], s[10:11] ; HAWAII-NEXT: s_mov_b64 s[36:37], s[8:9] ; HAWAII-NEXT: s_mov_b64 s[38:39], s[6:7] -; HAWAII-NEXT: 
s_mov_b64 s[40:41], s[4:5] +; HAWAII-NEXT: s_mov_b64 s[48:49], s[4:5] ; HAWAII-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; HAWAII-NEXT: s_mov_b64 s[46:47], exec +; HAWAII-NEXT: s_mov_b64 s[54:55], exec ; HAWAII-NEXT: s_addk_i32 s32, 0x400 -; HAWAII-NEXT: v_writelane_b32 v40, s49, 17 +; HAWAII-NEXT: v_writelane_b32 v40, s65, 17 ; HAWAII-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; HAWAII-NEXT: v_readfirstlane_b32 s16, v0 ; HAWAII-NEXT: v_readfirstlane_b32 s17, v1 ; HAWAII-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; HAWAII-NEXT: s_and_saveexec_b64 s[48:49], vcc -; HAWAII-NEXT: s_mov_b64 s[4:5], s[40:41] +; HAWAII-NEXT: s_and_saveexec_b64 s[64:65], vcc +; HAWAII-NEXT: s_mov_b64 s[4:5], s[48:49] ; HAWAII-NEXT: s_mov_b64 s[6:7], s[38:39] ; HAWAII-NEXT: s_mov_b64 s[8:9], s[36:37] ; HAWAII-NEXT: s_mov_b64 s[10:11], s[34:35] -; HAWAII-NEXT: s_mov_b32 s12, s45 -; HAWAII-NEXT: s_mov_b32 s13, s44 -; HAWAII-NEXT: s_mov_b32 s14, s43 -; HAWAII-NEXT: s_mov_b32 s15, s42 +; HAWAII-NEXT: s_mov_b32 s12, s53 +; HAWAII-NEXT: s_mov_b32 s13, s52 +; HAWAII-NEXT: s_mov_b32 s14, s51 +; HAWAII-NEXT: s_mov_b32 s15, s50 ; HAWAII-NEXT: v_mov_b32_e32 v0, v2 ; HAWAII-NEXT: v_mov_b32_e32 v1, v3 ; HAWAII-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -748,21 +748,21 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; HAWAII-NEXT: ; implicit-def: $vgpr31 ; HAWAII-NEXT: ; implicit-def: $vgpr2 ; HAWAII-NEXT: ; implicit-def: $vgpr3 -; HAWAII-NEXT: s_xor_b64 exec, exec, s[48:49] +; HAWAII-NEXT: s_xor_b64 exec, exec, s[64:65] ; HAWAII-NEXT: s_cbranch_execnz .LBB18_1 ; HAWAII-NEXT: ; %bb.2: -; HAWAII-NEXT: s_mov_b64 exec, s[46:47] +; HAWAII-NEXT: s_mov_b64 exec, s[54:55] ; HAWAII-NEXT: v_mov_b32_e32 v0, v4 -; HAWAII-NEXT: v_readlane_b32 s49, v40, 17 -; HAWAII-NEXT: v_readlane_b32 s48, v40, 16 -; HAWAII-NEXT: v_readlane_b32 s47, v40, 15 -; HAWAII-NEXT: v_readlane_b32 s46, v40, 14 -; HAWAII-NEXT: v_readlane_b32 s45, v40, 13 -; HAWAII-NEXT: v_readlane_b32 s44, v40, 12 -; 
HAWAII-NEXT: v_readlane_b32 s43, v40, 11 -; HAWAII-NEXT: v_readlane_b32 s42, v40, 10 -; HAWAII-NEXT: v_readlane_b32 s41, v40, 9 -; HAWAII-NEXT: v_readlane_b32 s40, v40, 8 +; HAWAII-NEXT: v_readlane_b32 s65, v40, 17 +; HAWAII-NEXT: v_readlane_b32 s64, v40, 16 +; HAWAII-NEXT: v_readlane_b32 s55, v40, 15 +; HAWAII-NEXT: v_readlane_b32 s54, v40, 14 +; HAWAII-NEXT: v_readlane_b32 s53, v40, 13 +; HAWAII-NEXT: v_readlane_b32 s52, v40, 12 +; HAWAII-NEXT: v_readlane_b32 s51, v40, 11 +; HAWAII-NEXT: v_readlane_b32 s50, v40, 10 +; HAWAII-NEXT: v_readlane_b32 s49, v40, 9 +; HAWAII-NEXT: v_readlane_b32 s48, v40, 8 ; HAWAII-NEXT: v_readlane_b32 s39, v40, 7 ; HAWAII-NEXT: v_readlane_b32 s38, v40, 6 ; HAWAII-NEXT: v_readlane_b32 s37, v40, 5 @@ -797,40 +797,40 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; GFX9-NEXT: v_writelane_b32 v40, s37, 5 ; GFX9-NEXT: v_writelane_b32 v40, s38, 6 ; GFX9-NEXT: v_writelane_b32 v40, s39, 7 -; GFX9-NEXT: v_writelane_b32 v40, s40, 8 -; GFX9-NEXT: v_writelane_b32 v40, s41, 9 -; GFX9-NEXT: v_writelane_b32 v40, s42, 10 -; GFX9-NEXT: v_writelane_b32 v40, s43, 11 -; GFX9-NEXT: v_writelane_b32 v40, s44, 12 -; GFX9-NEXT: v_writelane_b32 v40, s45, 13 -; GFX9-NEXT: v_writelane_b32 v40, s46, 14 -; GFX9-NEXT: v_writelane_b32 v40, s47, 15 -; GFX9-NEXT: v_writelane_b32 v40, s48, 16 -; GFX9-NEXT: s_mov_b32 s42, s15 -; GFX9-NEXT: s_mov_b32 s43, s14 -; GFX9-NEXT: s_mov_b32 s44, s13 -; GFX9-NEXT: s_mov_b32 s45, s12 +; GFX9-NEXT: v_writelane_b32 v40, s48, 8 +; GFX9-NEXT: v_writelane_b32 v40, s49, 9 +; GFX9-NEXT: v_writelane_b32 v40, s50, 10 +; GFX9-NEXT: v_writelane_b32 v40, s51, 11 +; GFX9-NEXT: v_writelane_b32 v40, s52, 12 +; GFX9-NEXT: v_writelane_b32 v40, s53, 13 +; GFX9-NEXT: v_writelane_b32 v40, s54, 14 +; GFX9-NEXT: v_writelane_b32 v40, s55, 15 +; GFX9-NEXT: v_writelane_b32 v40, s64, 16 +; GFX9-NEXT: s_mov_b32 s50, s15 +; GFX9-NEXT: s_mov_b32 s51, s14 +; GFX9-NEXT: s_mov_b32 s52, s13 +; GFX9-NEXT: s_mov_b32 s53, s12 ; 
GFX9-NEXT: s_mov_b64 s[34:35], s[10:11] ; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9] ; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7] -; GFX9-NEXT: s_mov_b64 s[40:41], s[4:5] +; GFX9-NEXT: s_mov_b64 s[48:49], s[4:5] ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 -; GFX9-NEXT: s_mov_b64 s[46:47], exec +; GFX9-NEXT: s_mov_b64 s[54:55], exec ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s49, 17 +; GFX9-NEXT: v_writelane_b32 v40, s65, 17 ; GFX9-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_readfirstlane_b32 s16, v0 ; GFX9-NEXT: v_readfirstlane_b32 s17, v1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_and_saveexec_b64 s[64:65], vcc +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s45 -; GFX9-NEXT: s_mov_b32 s13, s44 -; GFX9-NEXT: s_mov_b32 s14, s43 -; GFX9-NEXT: s_mov_b32 s15, s42 +; GFX9-NEXT: s_mov_b32 s12, s53 +; GFX9-NEXT: s_mov_b32 s13, s52 +; GFX9-NEXT: s_mov_b32 s14, s51 +; GFX9-NEXT: s_mov_b32 s15, s50 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -839,21 +839,21 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: s_xor_b64 exec, exec, s[48:49] +; GFX9-NEXT: s_xor_b64 exec, exec, s[64:65] ; GFX9-NEXT: s_cbranch_execnz .LBB18_1 ; GFX9-NEXT: ; %bb.2: -; GFX9-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-NEXT: s_mov_b64 exec, s[54:55] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_readlane_b32 s49, v40, 17 -; GFX9-NEXT: v_readlane_b32 s48, v40, 16 -; GFX9-NEXT: v_readlane_b32 s47, v40, 15 -; GFX9-NEXT: v_readlane_b32 s46, v40, 14 -; GFX9-NEXT: v_readlane_b32 s45, v40, 13 -; 
GFX9-NEXT: v_readlane_b32 s44, v40, 12 -; GFX9-NEXT: v_readlane_b32 s43, v40, 11 -; GFX9-NEXT: v_readlane_b32 s42, v40, 10 -; GFX9-NEXT: v_readlane_b32 s41, v40, 9 -; GFX9-NEXT: v_readlane_b32 s40, v40, 8 +; GFX9-NEXT: v_readlane_b32 s65, v40, 17 +; GFX9-NEXT: v_readlane_b32 s64, v40, 16 +; GFX9-NEXT: v_readlane_b32 s55, v40, 15 +; GFX9-NEXT: v_readlane_b32 s54, v40, 14 +; GFX9-NEXT: v_readlane_b32 s53, v40, 13 +; GFX9-NEXT: v_readlane_b32 s52, v40, 12 +; GFX9-NEXT: v_readlane_b32 s51, v40, 11 +; GFX9-NEXT: v_readlane_b32 s50, v40, 10 +; GFX9-NEXT: v_readlane_b32 s49, v40, 9 +; GFX9-NEXT: v_readlane_b32 s48, v40, 8 ; GFX9-NEXT: v_readlane_b32 s39, v40, 7 ; GFX9-NEXT: v_readlane_b32 s38, v40, 6 ; GFX9-NEXT: v_readlane_b32 s37, v40, 5 diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll index 0b9790463df10..8f09d6b9a5771 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll @@ -3,8 +3,6 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes='amdgpu-attributor' %s | FileCheck --check-prefixes=CHECK,CW %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes='amdgpu-attributor' -amdgpu-indirect-call-specialization-threshold=0 %s | FileCheck --check-prefixes=CHECK,NO %s -target datalayout = "A5" - @G = global i32 0, align 4 ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index b01640350dd5a..1861b080f0e0d 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -1,15 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s -target datalayout = "A5" - define internal void @indirect() { -; AKF_GCN-LABEL: define {{[^@]+}}@indirect() { -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@indirect ; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: ret void @@ -22,15 +16,6 @@ define internal void @indirect() { } define amdgpu_kernel void @test_simple_indirect_call() { -; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call -; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] { -; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) -; AKF_GCN-NEXT: [[FPTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR]] to ptr -; AKF_GCN-NEXT: store ptr @indirect, ptr [[FPTR_CAST]], align 8 -; AKF_GCN-NEXT: [[FP:%.*]] = load ptr, ptr [[FPTR_CAST]], align 8 -; AKF_GCN-NEXT: call void [[FP]]() -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -75,15 +60,7 @@ define amdgpu_kernel void @test_simple_indirect_call() { } -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} -;. -; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. 
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. -; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} -;. -; ATTRIBUTOR_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} -;. 
diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir index 080bd052a7391..cf23a9d1e8a57 100644 --- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -34,57 +34,56 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr34_sgpr35 = IMPLICIT_DEF ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $sgpr41 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr35 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr38_sgpr39 = COPY undef $sgpr8_sgpr9 ; CHECK-NEXT: renamable $sgpr36_sgpr37 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4) + ; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4) ; CHECK-NEXT: dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr38_sgpr39, 48, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) - ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr44_sgpr45, 0, 0 :: (invariant load (s64), align 16, addrspace 4) + ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr48_sgpr49, 0, 0 :: (invariant load (s64), align 16, addrspace 4) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $vgpr1 = COPY renamable $sgpr51 + ; CHECK-NEXT: $vgpr1 = COPY renamable $sgpr55 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, 
csr_amdgpu, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $vcc = COPY renamable $sgpr40_sgpr41 + ; CHECK-NEXT: $vcc = COPY renamable $sgpr34_sgpr35 ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4) ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4) ; CHECK-NEXT: S_CMP_LG_U64 renamable $sgpr4_sgpr5, 0, implicit-def $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000) - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.5, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CMP_EQ_U32 renamable $sgpr8, 0, implicit-def $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr38_sgpr39, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4) ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s32), addrspace 1) - ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], renamable $sgpr50_sgpr51, 0, 0, implicit $exec :: (store (s32), addrspace 1) - ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr49 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], renamable $sgpr54_sgpr55, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; 
CHECK-NEXT: dead [[COPY:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr53 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $sgpr6_sgpr7 = COPY killed renamable $sgpr36_sgpr37 - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY killed renamable $sgpr34_sgpr35 + ; CHECK-NEXT: renamable $sgpr10_sgpr11 = IMPLICIT_DEF ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: S_ENDPGM 0 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir index 3aead9eeb01ce..02189aa23370c 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir @@ -53,7 +53,7 @@ body: | bb.0: liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-LABEL: name: sgpr_spill_lane_crossover - ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr63, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $vgpr63, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, 
$sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr64, 0, $vgpr63 ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr64, $vgpr63, 0, 32 @@ -71,54 +71,22 @@ body: | ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr70, $vgpr63, 6, 32 ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr71, 7, $vgpr63 ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr71, $vgpr63, 7, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr72, 8, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr72, $vgpr63, 8, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr73, 9, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr73, $vgpr63, 9, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr74, 10, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr74, $vgpr63, 10, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr75, 11, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr75, $vgpr63, 11, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr76, 12, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr76, $vgpr63, 12, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr77, 13, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr77, $vgpr63, 13, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr78, 14, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr78, $vgpr63, 14, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr79, 15, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr79, $vgpr63, 15, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr80, 16, $vgpr63 - ; GCN-NEXT: 
frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr80, $vgpr63, 16, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr81, 17, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr81, $vgpr63, 17, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr82, 18, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr82, $vgpr63, 18, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr83, 19, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr83, $vgpr63, 19, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr84, 20, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr84, $vgpr63, 20, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr85, 21, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr85, $vgpr63, 21, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr86, 22, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr86, $vgpr63, 22, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr87, 23, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr87, $vgpr63, 23, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr88, 24, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr88, $vgpr63, 24, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr89, 25, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr89, $vgpr63, 25, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr90, 26, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr90, $vgpr63, 26, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr91, 27, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr91, $vgpr63, 27, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr92, 28, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers 
$sgpr92, $vgpr63, 28, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr93, 29, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr93, $vgpr63, 29, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr94, 30, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr94, $vgpr63, 30, 32 - ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 31, $vgpr63 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr95, $vgpr63, 31, 32 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr80, 8, $vgpr63 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr80, $vgpr63, 8, 32 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr81, 9, $vgpr63 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr81, $vgpr63, 9, 32 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr82, 10, $vgpr63 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr82, $vgpr63, 10, 32 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr83, 11, $vgpr63 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr83, $vgpr63, 11, 32 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr84, 12, $vgpr63 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr84, $vgpr63, 12, 32 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr85, 13, $vgpr63 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr85, $vgpr63, 13, 32 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr86, 14, $vgpr63 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr86, $vgpr63, 14, 32 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr87, 15, $vgpr63 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr87, $vgpr63, 15, 32 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] diff --git 
a/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir index c9a7e6331c252..cfa09c149e4c6 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir @@ -62,7 +62,7 @@ body: | bb.0: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr30_sgpr31, $vcc, $vgpr0 ; GCN-LABEL: name: spill_exec_copy_reserved_reg - ; GCN: liveins: $vcc, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $vgpr0, $vgpr2, $sgpr30_sgpr31 + ; GCN: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr30_sgpr31, $vcc, $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6 ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 @@ -70,21 +70,15 @@ body: | ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr1 ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 - ; GCN-NEXT: $sgpr28_sgpr29 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr2, 0 - ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr28_sgpr29 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 0, undef $vgpr2 - ; GCN-NEXT: 
frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr34, $vgpr2, 0, 32 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr35, 1, undef $vgpr2 - ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr35, $vgpr2, 1, 32 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, killed $vgpr0 ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, killed $vgpr0 - ; GCN-NEXT: $sgpr34_sgpr35 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $sgpr40_sgpr41 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $sgpr28_sgpr29 = IMPLICIT_DEF ; GCN-NEXT: $vgpr1 = COPY $vgpr0 ; GCN-NEXT: S_NOP 0, implicit $sgpr28_sgpr29 - ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr34_sgpr35 + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr40_sgpr41 ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0 ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1 ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8_sgpr9_sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr15, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $vcc diff --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll index d2b960fe43f84..68516dc8207f4 100644 --- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll @@ -1,13 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx803 
-verify-machineinstrs < %s | FileCheck -enable-var-scope %s -; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs: -; CHECK-DAG: v_writelane_b32 v0, s98, 63 -; CHECK-DAG: v_writelane_b32 v1, s99, 0 -; CHECK-NOT: dummy -; CHECK-DAG: v_readlane_b32 s99, v1, 0 -; CHECK-DAG: v_readlane_b32 s98, v0, 63 - define void @spill_more_than_wavesize_csr_sgprs() { +; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v0, s35, 0 +; CHECK-NEXT: v_writelane_b32 v0, s36, 1 +; CHECK-NEXT: v_writelane_b32 v0, s37, 2 +; CHECK-NEXT: v_writelane_b32 v0, s38, 3 +; CHECK-NEXT: v_writelane_b32 v0, s39, 4 +; CHECK-NEXT: v_writelane_b32 v0, s48, 5 +; CHECK-NEXT: v_writelane_b32 v0, s49, 6 +; CHECK-NEXT: v_writelane_b32 v0, s50, 7 +; CHECK-NEXT: v_writelane_b32 v0, s51, 8 +; CHECK-NEXT: v_writelane_b32 v0, s52, 9 +; CHECK-NEXT: v_writelane_b32 v0, s53, 10 +; CHECK-NEXT: v_writelane_b32 v0, s54, 11 +; CHECK-NEXT: v_writelane_b32 v0, s55, 12 +; CHECK-NEXT: v_writelane_b32 v0, s64, 13 +; CHECK-NEXT: v_writelane_b32 v0, s65, 14 +; CHECK-NEXT: v_writelane_b32 v0, s66, 15 +; CHECK-NEXT: v_writelane_b32 v0, s67, 16 +; CHECK-NEXT: v_writelane_b32 v0, s68, 17 +; CHECK-NEXT: v_writelane_b32 v0, s69, 18 +; CHECK-NEXT: v_writelane_b32 v0, s70, 19 +; CHECK-NEXT: v_writelane_b32 v0, s71, 20 +; CHECK-NEXT: v_writelane_b32 v0, s80, 21 +; CHECK-NEXT: v_writelane_b32 v0, s81, 22 +; CHECK-NEXT: v_writelane_b32 v0, s82, 23 +; CHECK-NEXT: v_writelane_b32 v0, s83, 24 +; CHECK-NEXT: v_writelane_b32 v0, s84, 25 +; CHECK-NEXT: v_writelane_b32 v0, s85, 26 +; CHECK-NEXT: v_writelane_b32 v0, s86, 27 +; CHECK-NEXT: v_writelane_b32 v0, s87, 28 +; CHECK-NEXT: v_writelane_b32 v0, s96, 29 +; CHECK-NEXT: v_writelane_b32 v0, s97, 30 +; CHECK-NEXT: v_writelane_b32 v0, 
s98, 31 +; CHECK-NEXT: v_writelane_b32 v0, s99, 32 +; CHECK-NEXT: v_writelane_b32 v0, s100, 33 +; CHECK-NEXT: v_writelane_b32 v0, s101, 34 +; CHECK-NEXT: v_writelane_b32 v0, s102, 35 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s102, v0, 35 +; CHECK-NEXT: v_readlane_b32 s101, v0, 34 +; CHECK-NEXT: v_readlane_b32 s100, v0, 33 +; CHECK-NEXT: v_readlane_b32 s99, v0, 32 +; CHECK-NEXT: v_readlane_b32 s98, v0, 31 +; CHECK-NEXT: v_readlane_b32 s97, v0, 30 +; CHECK-NEXT: v_readlane_b32 s96, v0, 29 +; CHECK-NEXT: v_readlane_b32 s87, v0, 28 +; CHECK-NEXT: v_readlane_b32 s86, v0, 27 +; CHECK-NEXT: v_readlane_b32 s85, v0, 26 +; CHECK-NEXT: v_readlane_b32 s84, v0, 25 +; CHECK-NEXT: v_readlane_b32 s83, v0, 24 +; CHECK-NEXT: v_readlane_b32 s82, v0, 23 +; CHECK-NEXT: v_readlane_b32 s81, v0, 22 +; CHECK-NEXT: v_readlane_b32 s80, v0, 21 +; CHECK-NEXT: v_readlane_b32 s71, v0, 20 +; CHECK-NEXT: v_readlane_b32 s70, v0, 19 +; CHECK-NEXT: v_readlane_b32 s69, v0, 18 +; CHECK-NEXT: v_readlane_b32 s68, v0, 17 +; CHECK-NEXT: v_readlane_b32 s67, v0, 16 +; CHECK-NEXT: v_readlane_b32 s66, v0, 15 +; CHECK-NEXT: v_readlane_b32 s65, v0, 14 +; CHECK-NEXT: v_readlane_b32 s64, v0, 13 +; CHECK-NEXT: v_readlane_b32 s55, v0, 12 +; CHECK-NEXT: v_readlane_b32 s54, v0, 11 +; CHECK-NEXT: v_readlane_b32 s53, v0, 10 +; CHECK-NEXT: v_readlane_b32 s52, v0, 9 +; CHECK-NEXT: v_readlane_b32 s51, v0, 8 +; CHECK-NEXT: v_readlane_b32 s50, v0, 7 +; CHECK-NEXT: v_readlane_b32 s49, v0, 6 +; CHECK-NEXT: v_readlane_b32 s48, v0, 5 +; CHECK-NEXT: v_readlane_b32 s39, v0, 4 +; CHECK-NEXT: v_readlane_b32 s38, v0, 3 +; CHECK-NEXT: v_readlane_b32 s37, v0, 2 +; CHECK-NEXT: v_readlane_b32 s36, v0, 1 +; CHECK-NEXT: v_readlane_b32 s35, v0, 0 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", 
"~{s35},~{s36},~{s37},~{s38},~{s39},~{s40},~{s41},~{s42} ,~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49},~{s50} @@ -21,13 +100,95 @@ define void @spill_more_than_wavesize_csr_sgprs() { ret void } -; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs_with_stack_object: -; CHECK-DAG: v_writelane_b32 v1, s98, 63 -; CHECK-DAG: v_writelane_b32 v2, s99, 0 -; CHECK-NOT: dummy -; CHECK-DAG: v_readlane_b32 s99, v2, 0 -; CHECK-DAG: v_readlane_b32 s98, v1, 63 define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() { +; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs_with_stack_object: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v1, s35, 0 +; CHECK-NEXT: v_writelane_b32 v1, s36, 1 +; CHECK-NEXT: v_writelane_b32 v1, s37, 2 +; CHECK-NEXT: v_writelane_b32 v1, s38, 3 +; CHECK-NEXT: v_writelane_b32 v1, s39, 4 +; CHECK-NEXT: v_writelane_b32 v1, s48, 5 +; CHECK-NEXT: v_writelane_b32 v1, s49, 6 +; CHECK-NEXT: v_writelane_b32 v1, s50, 7 +; CHECK-NEXT: v_writelane_b32 v1, s51, 8 +; CHECK-NEXT: v_writelane_b32 v1, s52, 9 +; CHECK-NEXT: v_writelane_b32 v1, s53, 10 +; CHECK-NEXT: v_writelane_b32 v1, s54, 11 +; CHECK-NEXT: v_writelane_b32 v1, s55, 12 +; CHECK-NEXT: v_writelane_b32 v1, s64, 13 +; CHECK-NEXT: v_writelane_b32 v1, s65, 14 +; CHECK-NEXT: v_writelane_b32 v1, s66, 15 +; CHECK-NEXT: v_writelane_b32 v1, s67, 16 +; CHECK-NEXT: v_writelane_b32 v1, s68, 17 +; CHECK-NEXT: v_writelane_b32 v1, s69, 18 +; CHECK-NEXT: v_writelane_b32 v1, s70, 19 +; CHECK-NEXT: v_writelane_b32 v1, s71, 20 +; CHECK-NEXT: v_writelane_b32 v1, s80, 21 +; CHECK-NEXT: v_writelane_b32 v1, s81, 22 +; CHECK-NEXT: v_writelane_b32 v1, s82, 23 +; CHECK-NEXT: v_writelane_b32 v1, s83, 24 +; CHECK-NEXT: v_writelane_b32 v1, s84, 25 +; CHECK-NEXT: v_writelane_b32 v1, s85, 26 +; 
CHECK-NEXT: v_writelane_b32 v1, s86, 27 +; CHECK-NEXT: v_writelane_b32 v1, s87, 28 +; CHECK-NEXT: v_writelane_b32 v1, s96, 29 +; CHECK-NEXT: v_writelane_b32 v1, s97, 30 +; CHECK-NEXT: v_writelane_b32 v1, s98, 31 +; CHECK-NEXT: v_writelane_b32 v1, s99, 32 +; CHECK-NEXT: v_writelane_b32 v1, s100, 33 +; CHECK-NEXT: v_writelane_b32 v1, s101, 34 +; CHECK-NEXT: v_writelane_b32 v1, s102, 35 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s102, v1, 35 +; CHECK-NEXT: v_readlane_b32 s101, v1, 34 +; CHECK-NEXT: v_readlane_b32 s100, v1, 33 +; CHECK-NEXT: v_readlane_b32 s99, v1, 32 +; CHECK-NEXT: v_readlane_b32 s98, v1, 31 +; CHECK-NEXT: v_readlane_b32 s97, v1, 30 +; CHECK-NEXT: v_readlane_b32 s96, v1, 29 +; CHECK-NEXT: v_readlane_b32 s87, v1, 28 +; CHECK-NEXT: v_readlane_b32 s86, v1, 27 +; CHECK-NEXT: v_readlane_b32 s85, v1, 26 +; CHECK-NEXT: v_readlane_b32 s84, v1, 25 +; CHECK-NEXT: v_readlane_b32 s83, v1, 24 +; CHECK-NEXT: v_readlane_b32 s82, v1, 23 +; CHECK-NEXT: v_readlane_b32 s81, v1, 22 +; CHECK-NEXT: v_readlane_b32 s80, v1, 21 +; CHECK-NEXT: v_readlane_b32 s71, v1, 20 +; CHECK-NEXT: v_readlane_b32 s70, v1, 19 +; CHECK-NEXT: v_readlane_b32 s69, v1, 18 +; CHECK-NEXT: v_readlane_b32 s68, v1, 17 +; CHECK-NEXT: v_readlane_b32 s67, v1, 16 +; CHECK-NEXT: v_readlane_b32 s66, v1, 15 +; CHECK-NEXT: v_readlane_b32 s65, v1, 14 +; CHECK-NEXT: v_readlane_b32 s64, v1, 13 +; CHECK-NEXT: v_readlane_b32 s55, v1, 12 +; CHECK-NEXT: v_readlane_b32 s54, v1, 11 +; CHECK-NEXT: v_readlane_b32 s53, v1, 10 +; CHECK-NEXT: v_readlane_b32 s52, v1, 9 +; CHECK-NEXT: v_readlane_b32 s51, v1, 8 +; CHECK-NEXT: v_readlane_b32 s50, v1, 7 +; CHECK-NEXT: v_readlane_b32 s49, v1, 6 +; CHECK-NEXT: v_readlane_b32 s48, v1, 5 +; CHECK-NEXT: v_readlane_b32 s39, v1, 4 +; CHECK-NEXT: v_readlane_b32 s38, v1, 3 +; CHECK-NEXT: v_readlane_b32 s37, v1, 2 +; 
CHECK-NEXT: v_readlane_b32 s36, v1, 1 +; CHECK-NEXT: v_readlane_b32 s35, v1, 0 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void asm sideeffect "", diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir index 8f53ec2f992da..359152e9d2b45 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir @@ -17,70 +17,78 @@ body: | ; RA-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; RA-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_1024 = S_MOV_B32 -1 ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_1024 = S_MOV_B32 -1 + ; RA-NEXT: SI_SPILL_S1024_SAVE [[S_MOV_B32_]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5) ; RA-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_1024 = S_MOV_B32 0 + ; RA-NEXT: SI_SPILL_S1024_SAVE [[S_MOV_B32_1]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) ; RA-NEXT: {{ $}} ; RA-NEXT: bb.1: ; RA-NEXT: successors: %bb.2(0x80000000) ; RA-NEXT: {{ $}} - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub4:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub5:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub6:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub7:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub8:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub9:sgpr_1024 = COPY 
[[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub10:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub11:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub12:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub13:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub14:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub15:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub16:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub17:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub18:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub19:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub20:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub21:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub22:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub23:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub24:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub25:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub26:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub27:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub28:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 - ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub29:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub1:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub2:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub3:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub4:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub5:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 
- ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub6:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub7:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub8:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub9:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub10:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub11:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub12:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub13:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub14:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub15:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub16:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub17:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub18:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub19:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub20:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub21:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub22:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub23:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub24:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub25:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub26:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub27:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub28:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub29:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub30:sgpr_1024 = COPY 
[[S_MOV_B32_1]].sub0 - ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub31:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[SI_SPILL_S1024_RESTORE:%[0-9]+]]:sgpr_1024 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; RA-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:sgpr_1024 = COPY [[SI_SPILL_S1024_RESTORE]].sub0_sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub2:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub3:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub4:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub5:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub6:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub7:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub8:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub9:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub10:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub11:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub12:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub13:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub14:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub15:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub16:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub17:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub18:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub19:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub20:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub21:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub22:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub23:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub24:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub25:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: 
[[COPY:%[0-9]+]].sub26:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub27:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: [[COPY:%[0-9]+]].sub28:sgpr_1024 = COPY [[COPY]].sub0 + ; RA-NEXT: [[COPY:%[0-9]+]].sub29:sgpr_1024 = COPY [[COPY]].sub1 + ; RA-NEXT: SI_SPILL_S1024_SAVE [[COPY]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; RA-NEXT: [[SI_SPILL_S1024_RESTORE1:%[0-9]+]]:sgpr_1024 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; RA-NEXT: undef [[COPY1:%[0-9]+]].sub0:sgpr_1024 = COPY [[SI_SPILL_S1024_RESTORE1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub1:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub2:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub3:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub4:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub5:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub6:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub7:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub8:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub9:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub10:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub11:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub12:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub13:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub14:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub15:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub16:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub17:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub18:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub19:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: 
[[COPY1:%[0-9]+]].sub20:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub21:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub22:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub23:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub24:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub25:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub26:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub27:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub28:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub29:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub30:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: [[COPY1:%[0-9]+]].sub31:sgpr_1024 = COPY [[COPY1]].sub0 + ; RA-NEXT: SI_SPILL_S1024_SAVE [[COPY1]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) ; RA-NEXT: {{ $}} ; RA-NEXT: bb.2: ; RA-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) @@ -95,14 +103,17 @@ body: | ; VR-NEXT: {{ $}} ; VR-NEXT: renamable $sgpr37 = S_MOV_B32 -1 ; VR-NEXT: renamable $sgpr36 = S_MOV_B32 -1 - ; VR-NEXT: renamable $sgpr68 = S_MOV_B32 0 + ; VR-NEXT: SI_SPILL_S1024_SAVE renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; VR-NEXT: renamable $sgpr36 = S_MOV_B32 0 ; VR-NEXT: renamable $sgpr30_sgpr31 = IMPLICIT_DEF ; VR-NEXT: renamable $sgpr34_sgpr35 = IMPLICIT_DEF + ; VR-NEXT: SI_SPILL_S1024_SAVE killed renamable 
$sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) ; VR-NEXT: {{ $}} ; VR-NEXT: bb.1: ; VR-NEXT: successors: %bb.2(0x80000000) - ; VR-NEXT: liveins: $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003 + ; VR-NEXT: liveins: $sgpr30_sgpr31, $sgpr34_sgpr35 ; VR-NEXT: {{ $}} + ; VR-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) ; VR-NEXT: renamable $sgpr38 = COPY renamable $sgpr36 ; VR-NEXT: renamable $sgpr39 = COPY renamable $sgpr37 ; VR-NEXT: renamable $sgpr40 = COPY renamable $sgpr36 @@ -131,41 +142,44 @@ body: | ; VR-NEXT: renamable $sgpr63 = COPY renamable $sgpr37 ; VR-NEXT: renamable $sgpr64 = COPY renamable $sgpr36 ; VR-NEXT: renamable $sgpr65 = COPY renamable $sgpr37 - ; VR-NEXT: renamable $sgpr69 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr70 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr71 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr72 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr73 = COPY 
renamable $sgpr68 - ; VR-NEXT: renamable $sgpr74 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr75 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr76 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr77 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr78 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr79 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr80 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr81 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr82 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr83 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr84 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr85 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr86 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr87 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr88 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr89 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr90 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr91 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr92 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr93 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr94 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr95 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr96 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr97 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr98 = COPY renamable $sgpr68 - ; VR-NEXT: renamable $sgpr99 = COPY renamable $sgpr68 + ; VR-NEXT: SI_SPILL_S1024_SAVE killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; VR-NEXT: renamable 
$sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; VR-NEXT: renamable $sgpr37 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr38 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr39 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr40 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr41 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr42 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr43 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr44 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr45 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr46 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr47 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr48 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr49 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr50 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr51 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr52 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr53 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr54 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr55 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr56 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr57 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr58 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr59 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr60 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr61 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr62 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr63 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr64 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr65 = COPY renamable $sgpr36 + ; VR-NEXT: renamable 
$sgpr66 = COPY renamable $sgpr36 + ; VR-NEXT: renamable $sgpr67 = COPY renamable $sgpr36 + ; VR-NEXT: SI_SPILL_S1024_SAVE killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) ; VR-NEXT: {{ $}} ; VR-NEXT: bb.2: ; VR-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; VR-NEXT: liveins: $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003 + ; VR-NEXT: liveins: $sgpr30_sgpr31, $sgpr34_sgpr35 ; VR-NEXT: {{ $}} ; VR-NEXT: S_NOP 0, csr_amdgpu, implicit renamable $sgpr30_sgpr31, implicit renamable $sgpr34_sgpr35 ; VR-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 70b1f1366eac5..ca3ea8f8ab800 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -1,7 +1,6 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; XFAIL: * - ; Check that we properly realign the stack. While 4-byte access is all ; that is ever needed, some transformations rely on the known bits from the alignment of the pointer (e.g. 
@@ -10,92 +9,164 @@ ; 4 byte emergency stack slot ; = 144 bytes with padding between them -; GCN-LABEL: {{^}}needs_align16_default_stack_align: -; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, v0 -; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, s32 -; GCN: v_add_u32_e32 [[FI:v[0-9]+]], vcc, [[SCALED_IDX]], [[FRAMEDIFF]] - -; GCN-NOT: s32 - -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen - -; GCN-NOT: s32 - -; GCN: ; ScratchSize: 144 define void @needs_align16_default_stack_align(i32 %idx) #0 { +; GCN-LABEL: needs_align16_default_stack_align: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s32 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 4 +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, 12, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, 8, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5) %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile <4 x i32> , ptr addrspace(5) %gep0, align 16 ret void } -; GCN-LABEL: {{^}}needs_align16_stack_align4: -; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}} -; GCN: 
s_and_b32 s33, [[SCRATCH_REG]], 0xfffffc00 - -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0x2800{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen - -; GCN: s_mov_b32 s32, s34 - -; GCN: ; ScratchSize: 160 define void @needs_align16_stack_align4(i32 %idx) #2 { +; GCN-LABEL: needs_align16_stack_align4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x3c0 +; GCN-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 4 +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, 12, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, 8, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: s_addk_i32 s32, 0x2800 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s5 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5) %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile <4 x i32> , ptr addrspace(5) %gep0, align 16 ret void } -; GCN-LABEL: {{^}}needs_align32: -; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 
0x7c0{{$}} -; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xfffff800 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0x3000{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen - -; GCN: s_mov_b32 s32, s34 - -; GCN: ; ScratchSize: 192 define void @needs_align32(i32 %idx) #0 { +; GCN-LABEL: needs_align32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x7c0 +; GCN-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 4 +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, 12, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, 8, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: s_addk_i32 s32, 0x3000 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s5 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5) %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile <4 x i32> , ptr addrspace(5) %gep0, align 32 ret void } -; GCN-LABEL: {{^}}force_realign4: -; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}} -; 
GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffffff00 -; GCN: s_addk_i32 s32, 0xd00{{$}} - -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_mov_b32 s32, s34 - -; GCN: ; ScratchSize: 52 define void @force_realign4(i32 %idx) #1 { +; GCN-LABEL: force_realign4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0xc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffff00 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GCN-NEXT: s_addk_i32 s32, 0xd00 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v1, 3 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s5 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca.align16 = alloca [8 x i32], align 4, addrspace(5) %gep0 = getelementptr inbounds [8 x i32], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile i32 3, ptr addrspace(5) %gep0, align 4 ret void } -; GCN-LABEL: {{^}}kernel_call_align16_from_8: -; GCN: s_movk_i32 s32, 0x400{{$}} -; GCN-NOT: s32 -; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align16_from_8() #0 { +; GCN-LABEL: kernel_call_align16_from_8: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, needs_align16_default_stack_align@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_default_stack_align@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; 
GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v3, 2 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_movk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm %alloca = alloca i32, align 4, addrspace(5) store volatile i32 2, ptr addrspace(5) %alloca call void @needs_align16_default_stack_align(i32 1) @@ -103,10 +174,32 @@ define amdgpu_kernel void @kernel_call_align16_from_8() #0 { } ; The call sequence should keep the stack on call aligned to 4 -; GCN-LABEL: {{^}}kernel_call_align16_from_5: -; GCN: s_movk_i32 s32, 0x400 -; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align16_from_5() { +; GCN-LABEL: kernel_call_align16_from_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, needs_align16_default_stack_align@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_default_stack_align@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v3, 2 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_movk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_byte v3, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm %alloca0 = alloca i8, align 1, addrspace(5) store volatile i8 2, ptr addrspace(5) %alloca0 @@ -114,10 +207,32 @@ define amdgpu_kernel void 
@kernel_call_align16_from_5() { ret void } -; GCN-LABEL: {{^}}kernel_call_align4_from_5: -; GCN: s_movk_i32 s32, 0x400 -; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align4_from_5() { +; GCN-LABEL: kernel_call_align4_from_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, needs_align16_stack_align4@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_stack_align4@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v3, 2 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_movk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_byte v3, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm %alloca0 = alloca i8, align 1, addrspace(5) store volatile i8 2, ptr addrspace(5) %alloca0 @@ -125,28 +240,36 @@ define amdgpu_kernel void @kernel_call_align4_from_5() { ret void } -; GCN-LABEL: {{^}}default_realign_align128: -; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GCN-NEXT: s_mov_b32 s5, s34 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: s_addk_i32 s32, 0x4000 -; GCN-NOT: s33 -; GCN: buffer_store_dword v0, off, s[0:3], s33{{$}} -; GCN: s_mov_b32 s32, s34 -; GCN: s_mov_b32 s33, [[FP_COPY]] define void @default_realign_align128(i32 %idx) #0 { +; GCN-LABEL: default_realign_align128: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: 
s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0x4000 +; GCN-NEXT: v_mov_b32_e32 v0, 9 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s5 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca.align = alloca i32, align 128, addrspace(5) store volatile i32 9, ptr addrspace(5) %alloca.align, align 128 ret void } -; GCN-LABEL: {{^}}disable_realign_align128: -; GCN-NOT: s32 -; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} -; GCN-NOT: s32 define void @disable_realign_align128(i32 %idx) #3 { +; GCN-LABEL: disable_realign_align128: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 9 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca.align = alloca i32, align 128, addrspace(5) store volatile i32 9, ptr addrspace(5) %alloca.align, align 128 ret void @@ -158,35 +281,48 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; since there is a local object with an alignment of 1024. ; Should use BP to access the incoming stack arguments. ; The BP value is saved/restored with a VGPR spill. 
- ; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill: -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 -; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 -; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2 -; GCN-NEXT: v_mov_b32_e32 v32, 0 -; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3 -; GCN: s_mov_b32 s34, s32 -; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 -; GCN-DAG: s_add_i32 s32, s32, 0x30000 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN: s_swappc_b64 s[30:31], +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s16, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v40, s16, 2 +; GCN-NEXT: v_writelane_b32 v40, s34, 3 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: s_add_i32 s32, s32, 0x30000 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s34 offset:4 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], 
s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 3 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 -; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 -; GCN-NEXT: s_mov_b32 s32, s34 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2 -; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN: s_setpc_b64 s[30:31] %temp = alloca i32, align 1024, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 1024 call void @extern_func(<32 x i32> %a, i32 %b) @@ -200,23 +336,56 @@ define i32 @needs_align1024_stack_args_used_inside_loop(ptr addrspace(5) nocaptu ; index variable, the base pointer first get loaded into a VGPR ; and that value should be further referenced to load the incoming values. ; The BP value will get saved/restored in an SGPR at the prolgoue/epilogue. 
- ; GCN-LABEL: needs_align1024_stack_args_used_inside_loop: -; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 -; GCN-NEXT: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 -; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34 -; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0 -; GCN: s_add_i32 s32, s32, 0x30000 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024 -; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen -; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]] -; GCN: s_mov_b32 s32, s34 -; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]] -; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN: ; %bb.0: ; %begin +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s11, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 +; GCN-NEXT: s_mov_b32 s14, s34 +; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s34 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_add_i32 s32, s32, 0x30000 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1024 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GCN-NEXT: s_branch .LBB10_2 +; GCN-NEXT: .LBB10_1: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB10_2 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-NEXT: s_and_b64 s[8:9], exec, s[6:7] +; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_4 +; GCN-NEXT: .LBB10_2: ; %loop_body +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s10, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: 
s_cbranch_execz .LBB10_1 +; GCN-NEXT: ; %bb.3: ; %loop_end +; GCN-NEXT: ; in Loop: Header=BB10_2 Depth=1 +; GCN-NEXT: s_add_i32 s10, s10, 1 +; GCN-NEXT: s_cmp_eq_u32 s10, 9 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_b64 s[12:13], s[12:13], exec +; GCN-NEXT: v_add_u32_e32 v1, vcc, 4, v1 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] +; GCN-NEXT: s_branch .LBB10_1 +; GCN-NEXT: .LBB10_4: ; %exit +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s14 +; GCN-NEXT: s_mov_b32 s33, s11 +; GCN-NEXT: s_setpc_b64 s[30:31] begin: %local_var = alloca i32, align 1024, addrspace(5) store volatile i32 0, ptr addrspace(5) %local_var, align 1024 @@ -241,16 +410,24 @@ exit: ; preds = %loop_end, %loop_b define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 { ; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy: -; GCN: ; %bb.0: -; GCN: v_writelane_b32 [[VGPR_REG:v[0-9]+]], s34, 0 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 -; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ;;#ASMEND -; GCN: s_setpc_b64 s[30:31] +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s40, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_mov_b32 s41, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: s_addk_i32 s32, 0x6000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s41 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b32 s33, s40 +; GCN-NEXT: s_setpc_b64 
s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 ; Use all clobberable registers, so BP has to spill to a VGPR. @@ -264,15 +441,102 @@ define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 { define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 { ; If there are no free SGPRs or VGPRs available we must spill the BP to memory. - -; GCN-LABEL: no_free_regs_spill_bp_to_mem -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_xor_saveexec_b64 s[6:7], -1 -; GCN: buffer_store_dword v39, off, s[0:3], s33 -; GCN: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] -; GCN: buffer_store_dword v0, off, s[0:3], s33 -; GCN: v_mov_b32_e32 v0, s34 -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s33 +; GCN-LABEL: no_free_regs_spill_bp_to_memory: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_writelane_b32 v39, s4, 32 +; GCN-NEXT: v_writelane_b32 v39, s34, 33 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0x6000 +; GCN-NEXT: v_writelane_b32 v39, s39, 0 +; GCN-NEXT: v_writelane_b32 v39, s48, 1 +; GCN-NEXT: v_writelane_b32 v39, s49, 2 +; GCN-NEXT: v_writelane_b32 v39, s50, 3 +; GCN-NEXT: v_writelane_b32 v39, s51, 4 +; GCN-NEXT: v_writelane_b32 v39, s52, 5 +; GCN-NEXT: v_writelane_b32 v39, s53, 6 +; GCN-NEXT: v_writelane_b32 v39, s54, 7 +; GCN-NEXT: v_writelane_b32 v39, s55, 8 +; GCN-NEXT: v_writelane_b32 v39, s64, 9 +; GCN-NEXT: v_writelane_b32 v39, s65, 10 +; GCN-NEXT: v_writelane_b32 v39, s66, 11 +; GCN-NEXT: v_writelane_b32 v39, s67, 12 +; GCN-NEXT: v_writelane_b32 v39, s68, 13 +; GCN-NEXT: v_writelane_b32 v39, s69, 14 +; GCN-NEXT: v_writelane_b32 v39, s70, 
15 +; GCN-NEXT: v_writelane_b32 v39, s71, 16 +; GCN-NEXT: v_writelane_b32 v39, s80, 17 +; GCN-NEXT: v_writelane_b32 v39, s81, 18 +; GCN-NEXT: v_writelane_b32 v39, s82, 19 +; GCN-NEXT: v_writelane_b32 v39, s83, 20 +; GCN-NEXT: v_writelane_b32 v39, s84, 21 +; GCN-NEXT: v_writelane_b32 v39, s85, 22 +; GCN-NEXT: v_writelane_b32 v39, s86, 23 +; GCN-NEXT: v_writelane_b32 v39, s87, 24 +; GCN-NEXT: v_writelane_b32 v39, s96, 25 +; GCN-NEXT: v_writelane_b32 v39, s97, 26 +; GCN-NEXT: v_writelane_b32 v39, s98, 27 +; GCN-NEXT: v_writelane_b32 v39, s99, 28 +; GCN-NEXT: v_writelane_b32 v39, s100, 29 +; GCN-NEXT: v_writelane_b32 v39, s101, 30 +; GCN-NEXT: v_writelane_b32 v39, s102, 31 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: v_readlane_b32 s34, v39, 33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber all VGPRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s102, v39, 31 +; GCN-NEXT: v_readlane_b32 s101, v39, 30 +; GCN-NEXT: v_readlane_b32 s100, v39, 29 +; GCN-NEXT: v_readlane_b32 s99, v39, 28 +; GCN-NEXT: v_readlane_b32 s98, v39, 27 +; GCN-NEXT: v_readlane_b32 s97, v39, 26 +; GCN-NEXT: v_readlane_b32 s96, v39, 25 +; GCN-NEXT: v_readlane_b32 s87, v39, 24 +; GCN-NEXT: v_readlane_b32 s86, v39, 23 +; GCN-NEXT: v_readlane_b32 s85, v39, 22 +; GCN-NEXT: v_readlane_b32 s84, v39, 21 +; GCN-NEXT: v_readlane_b32 s83, v39, 20 +; GCN-NEXT: v_readlane_b32 s82, v39, 19 +; GCN-NEXT: v_readlane_b32 s81, v39, 18 +; GCN-NEXT: v_readlane_b32 s80, v39, 17 +; GCN-NEXT: v_readlane_b32 s71, v39, 16 +; GCN-NEXT: v_readlane_b32 s70, v39, 15 +; GCN-NEXT: v_readlane_b32 s69, v39, 14 +; GCN-NEXT: v_readlane_b32 s68, v39, 13 +; GCN-NEXT: v_readlane_b32 s67, v39, 12 +; GCN-NEXT: v_readlane_b32 s66, v39, 11 +; GCN-NEXT: 
v_readlane_b32 s65, v39, 10 +; GCN-NEXT: v_readlane_b32 s64, v39, 9 +; GCN-NEXT: v_readlane_b32 s55, v39, 8 +; GCN-NEXT: v_readlane_b32 s54, v39, 7 +; GCN-NEXT: v_readlane_b32 s53, v39, 6 +; GCN-NEXT: v_readlane_b32 s52, v39, 5 +; GCN-NEXT: v_readlane_b32 s51, v39, 4 +; GCN-NEXT: v_readlane_b32 s50, v39, 3 +; GCN-NEXT: v_readlane_b32 s49, v39, 2 +; GCN-NEXT: v_readlane_b32 s48, v39, 1 +; GCN-NEXT: v_readlane_b32 s39, v39, 0 +; GCN-NEXT: v_readlane_b32 s4, v39, 32 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 @@ -299,22 +563,105 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 { define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i32 %b, ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #5 { ; If the size of the offset exceeds the MUBUF offset field we need another ; scratch VGPR to hold the offset. 
- -; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GCN-NEXT: s_add_i32 s5, s33, 0x42100 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] -; GCN-NEXT: s_add_i32 s5, s33, 0x42200 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NEXT: s_add_i32 s5, s33, 0x42300 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_add_i32 s5, s33, 0x42100 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_writelane_b32 v39, s4, 32 +; GCN-NEXT: v_writelane_b32 v39, s34, 33 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_add_i32 s32, s32, 0x46000 +; GCN-NEXT: v_writelane_b32 v39, s39, 0 +; GCN-NEXT: v_writelane_b32 v39, s48, 1 +; GCN-NEXT: v_writelane_b32 v39, s49, 2 +; GCN-NEXT: v_writelane_b32 v39, s50, 3 +; GCN-NEXT: v_writelane_b32 v39, s51, 4 +; GCN-NEXT: v_writelane_b32 v39, s52, 5 +; GCN-NEXT: v_writelane_b32 v39, s53, 6 +; GCN-NEXT: v_writelane_b32 v39, s54, 7 +; GCN-NEXT: v_writelane_b32 v39, s55, 8 +; GCN-NEXT: v_writelane_b32 v39, s64, 9 +; GCN-NEXT: v_writelane_b32 v39, s65, 10 +; GCN-NEXT: v_writelane_b32 v39, s66, 11 +; GCN-NEXT: v_writelane_b32 v39, s67, 12 +; GCN-NEXT: v_writelane_b32 v39, s68, 13 +; GCN-NEXT: v_writelane_b32 v39, s69, 14 +; GCN-NEXT: v_writelane_b32 v39, 
s70, 15 +; GCN-NEXT: v_writelane_b32 v39, s71, 16 +; GCN-NEXT: v_writelane_b32 v39, s80, 17 +; GCN-NEXT: v_writelane_b32 v39, s81, 18 +; GCN-NEXT: v_writelane_b32 v39, s82, 19 +; GCN-NEXT: v_writelane_b32 v39, s83, 20 +; GCN-NEXT: v_writelane_b32 v39, s84, 21 +; GCN-NEXT: v_writelane_b32 v39, s85, 22 +; GCN-NEXT: v_writelane_b32 v39, s86, 23 +; GCN-NEXT: v_writelane_b32 v39, s87, 24 +; GCN-NEXT: v_writelane_b32 v39, s96, 25 +; GCN-NEXT: v_writelane_b32 v39, s97, 26 +; GCN-NEXT: v_writelane_b32 v39, s98, 27 +; GCN-NEXT: v_writelane_b32 v39, s99, 28 +; GCN-NEXT: v_writelane_b32 v39, s100, 29 +; GCN-NEXT: v_writelane_b32 v39, s101, 30 +; GCN-NEXT: v_writelane_b32 v39, s102, 31 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: v_mov_b32_e32 v1, 0x1080 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: v_readlane_b32 s34, v39, 33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber all VGPRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s102, v39, 31 +; GCN-NEXT: v_readlane_b32 s101, v39, 30 +; GCN-NEXT: v_readlane_b32 s100, v39, 29 +; GCN-NEXT: v_readlane_b32 s99, v39, 28 +; GCN-NEXT: v_readlane_b32 s98, v39, 27 +; GCN-NEXT: v_readlane_b32 s97, v39, 26 +; GCN-NEXT: v_readlane_b32 s96, v39, 25 +; GCN-NEXT: v_readlane_b32 s87, v39, 24 +; GCN-NEXT: v_readlane_b32 s86, v39, 23 +; GCN-NEXT: v_readlane_b32 s85, v39, 22 +; GCN-NEXT: v_readlane_b32 s84, v39, 21 +; GCN-NEXT: v_readlane_b32 s83, v39, 20 +; GCN-NEXT: v_readlane_b32 s82, v39, 19 +; GCN-NEXT: v_readlane_b32 s81, v39, 18 +; GCN-NEXT: v_readlane_b32 s80, v39, 17 +; GCN-NEXT: v_readlane_b32 s71, v39, 16 +; GCN-NEXT: v_readlane_b32 s70, v39, 15 +; GCN-NEXT: v_readlane_b32 s69, v39, 14 +; GCN-NEXT: v_readlane_b32 s68, v39, 13 +; GCN-NEXT: v_readlane_b32 s67, v39, 12 +; GCN-NEXT: 
v_readlane_b32 s66, v39, 11 +; GCN-NEXT: v_readlane_b32 s65, v39, 10 +; GCN-NEXT: v_readlane_b32 s64, v39, 9 +; GCN-NEXT: v_readlane_b32 s55, v39, 8 +; GCN-NEXT: v_readlane_b32 s54, v39, 7 +; GCN-NEXT: v_readlane_b32 s53, v39, 6 +; GCN-NEXT: v_readlane_b32 s52, v39, 5 +; GCN-NEXT: v_readlane_b32 s51, v39, 4 +; GCN-NEXT: v_readlane_b32 s50, v39, 3 +; GCN-NEXT: v_readlane_b32 s49, v39, 2 +; GCN-NEXT: v_readlane_b32 s48, v39, 1 +; GCN-NEXT: v_readlane_b32 s39, v39, 0 +; GCN-NEXT: v_readlane_b32 s4, v39, 32 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_add_i32 s5, s33, 0x42100 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 80ccd1ffe0294..001c35ef30cc6 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -858,6 +858,7 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm @@ -990,10 +991,13 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_co_u32 v10, vcc_lo, v10, v14 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v11, v15, vcc_lo ; GFX12-NEXT: v_sub_co_u32 v8, vcc_lo, v8, v12 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, 
v9, v13, vcc_lo ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll index c0587d260c6f2..75f3b6ff8917a 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll @@ -28,10 +28,10 @@ define void @tail_call_i64_inreg_uniform_in_vgpr_convergence_tokens() #0 { ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DS_READ_B64_gfx9_]].sub0 ; CHECK-NEXT: CONVERGENCECTRL_GLUE [[CONVERGENCECTRL_ENTRY]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec, implicit [[CONVERGENCECTRL_ENTRY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec, implicit [[CONVERGENCECTRL_ENTRY]] ; CHECK-NEXT: CONVERGENCECTRL_GLUE [[CONVERGENCECTRL_ENTRY]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec, implicit [[CONVERGENCECTRL_ENTRY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec, implicit [[CONVERGENCECTRL_ENTRY]] ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @void_func_i64_inreg, target-flags(amdgpu-gotprel32-hi) @void_func_i64_inreg, implicit-def dead $scc ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, 
addrspace 4) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]] diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll index ac449f972acb5..a06899eb9b0e6 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll @@ -28,11 +28,11 @@ define void @tail_call_uniform_vgpr_value_convergence_tokens() #0 { ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DS_READ_B64_gfx9_]].sub1 ; CHECK-NEXT: CONVERGENCECTRL_GLUE [[CONVERGENCECTRL_ENTRY]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec, implicit [[CONVERGENCECTRL_ENTRY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec, implicit [[CONVERGENCECTRL_ENTRY]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DS_READ_B64_gfx9_]].sub0 ; CHECK-NEXT: CONVERGENCECTRL_GLUE [[CONVERGENCECTRL_ENTRY]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec, implicit [[CONVERGENCECTRL_ENTRY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec, implicit [[CONVERGENCECTRL_ENTRY]] ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]] ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY7]] diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index cc9fe68ad0dfd..e0d3b5e4064b4 100644 --- 
a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -32,15 +32,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-LABEL: kernel: ; GLOBALNESS1: ; %bb.0: ; %bb ; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0 +; GLOBALNESS1-NEXT: s_load_dwordx4 s[52:55], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77] -; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] +; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[52:53] +; GLOBALNESS1-NEXT: s_mov_b64 s[48:49], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 @@ -49,7 +49,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s78, 0 +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s54, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 @@ -59,24 +59,27 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], 
-1, 0 ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 -; GLOBALNESS1-NEXT: s_mov_b32 s70, s16 ; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS1-NEXT: s_mov_b32 s71, s15 -; GLOBALNESS1-NEXT: s_mov_b32 s72, s14 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56 : SGPR spill to VGPR lane +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: v_writelane_b32 v56, s8, 0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 +; GLOBALNESS1-NEXT: v_writelane_b32 v56, s9, 1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[70:71], 1, v3 +; GLOBALNESS1-NEXT: s_mov_b32 s82, s16 +; GLOBALNESS1-NEXT: s_mov_b32 s83, s15 +; GLOBALNESS1-NEXT: s_mov_b32 s84, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45 @@ -86,26 +89,35 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS1-NEXT: v_writelane_b32 v56, s4, 2 ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GLOBALNESS1-NEXT: v_writelane_b32 v56, s5, 3 +; 
GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 +; GLOBALNESS1-NEXT: v_writelane_b32 v56, s4, 4 +; GLOBALNESS1-NEXT: v_writelane_b32 v56, s5, 5 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2 +; GLOBALNESS1-NEXT: v_writelane_b32 v56, s4, 6 +; GLOBALNESS1-NEXT: v_writelane_b32 v56, s5, 7 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1 +; GLOBALNESS1-NEXT: v_writelane_b32 v56, s70, 8 +; GLOBALNESS1-NEXT: v_writelane_b32 v56, s71, 9 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[60:61] +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v56, 6 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v56, 7 +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 -; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_30 ; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 @@ -122,56 +134,62 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 ; GLOBALNESS1-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GLOBALNESS1-NEXT: 
s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 -; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[70:71] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 +; GLOBALNESS1-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lt_i32 s79, 1 +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], -1 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 +; GLOBALNESS1-NEXT: s_cmp_lt_i32 s55, 1 +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 1 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 -; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_8 -; GLOBALNESS1-NEXT: s_branch .LBB1_9 -; GLOBALNESS1-NEXT: .LBB1_7: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s55, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 -; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS1-NEXT: .LBB1_8: ; %LeafBlock +; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS1-NEXT: .LBB1_7: ; %Flow26 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 -; 
GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 +; GLOBALNESS1-NEXT: ; %bb.8: ; %LeafBlock +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s55, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], 0 +; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3] ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[86:87], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[74:75], s[62:63] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[52:53], s[86:87] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] +; GLOBALNESS1-NEXT: v_writelane_b32 v56, s8, 10 +; GLOBALNESS1-NEXT: v_writelane_b32 v56, s9, 11 +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v56, 2 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v56, 3 +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -185,70 +203,72 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] ; GLOBALNESS1-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[96:97], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[98:99], 1, v0 ; GLOBALNESS1-NEXT: s_branch .LBB1_16 ; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[68:69] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[64:65] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[44:45] +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v56, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v56, 1 +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[98:99] ; GLOBALNESS1-NEXT: .LBB1_21: ; 
%spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[80:81] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_add_u32 s68, s38, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s69, s39, 0 +; GLOBALNESS1-NEXT: s_add_u32 s70, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s71, s39, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS1-NEXT: s_load_dwordx2 s[76:77], s[4:5], 0x0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[70:71] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[70:71] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS1-NEXT: 
s_mov_b32 s13, s83 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -256,20 +276,29 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_14 ; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GLOBALNESS1-NEXT: s_branch .LBB1_3 ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow23 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0 +; GLOBALNESS1-NEXT: v_readlane_b32 s70, v56, 8 +; GLOBALNESS1-NEXT: v_readlane_b32 s8, v56, 10 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s71, v56, 9 +; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) +; GLOBALNESS1-NEXT: s_mov_b32 s55, s7 +; GLOBALNESS1-NEXT: v_readlane_b32 s9, v56, 11 ; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[74:75] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[86:87] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v56, 4 +; GLOBALNESS1-NEXT: 
v_readlane_b32 s7, v56, 5 +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -284,18 +313,18 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_32 ; GLOBALNESS1-NEXT: ; %bb.31: ; %bb7.i.i ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 @@ -308,12 +337,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; %bb.33: ; %bb11.i.i ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS1-NEXT: 
s_mov_b32 s14, s82 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 @@ -324,15 +353,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb ; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0 +; GLOBALNESS0-NEXT: s_load_dwordx4 s[52:55], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73] -; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] +; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[52:53] +; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 @@ -341,7 +370,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s54, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 @@ -351,24 +380,27 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 
0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 -; GLOBALNESS0-NEXT: s_mov_b32 s68, s16 ; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS0-NEXT: s_mov_b32 s69, s15 -; GLOBALNESS0-NEXT: s_mov_b32 s70, s14 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56 : SGPR spill to VGPR lane +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: v_writelane_b32 v56, s8, 0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 +; GLOBALNESS0-NEXT: v_writelane_b32 v56, s9, 1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[84:85], 1, v3 +; GLOBALNESS0-NEXT: s_mov_b32 s70, s16 +; GLOBALNESS0-NEXT: s_mov_b32 s71, s15 +; GLOBALNESS0-NEXT: s_mov_b32 s82, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45 @@ -378,26 +410,35 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS0-NEXT: v_writelane_b32 v56, s4, 2 ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; 
GLOBALNESS0-NEXT: v_writelane_b32 v56, s5, 3 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 +; GLOBALNESS0-NEXT: v_writelane_b32 v56, s4, 4 +; GLOBALNESS0-NEXT: v_writelane_b32 v56, s5, 5 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2 +; GLOBALNESS0-NEXT: v_writelane_b32 v56, s4, 6 +; GLOBALNESS0-NEXT: v_writelane_b32 v56, s5, 7 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1 +; GLOBALNESS0-NEXT: v_writelane_b32 v56, s84, 8 +; GLOBALNESS0-NEXT: v_writelane_b32 v56, s85, 9 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[60:61] +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v56, 6 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v56, 7 +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 -; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30 ; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 @@ -414,56 +455,63 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 ; GLOBALNESS0-NEXT: 
s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 -; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[84:85] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 +; GLOBALNESS0-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lt_i32 s75, 1 +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], -1 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 +; GLOBALNESS0-NEXT: s_cmp_lt_i32 s55, 1 +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 1 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 -; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_8 -; GLOBALNESS0-NEXT: s_branch .LBB1_9 -; GLOBALNESS0-NEXT: .LBB1_7: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s55, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 -; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS0-NEXT: .LBB1_8: ; %LeafBlock +; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS0-NEXT: .LBB1_7: ; %Flow26 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: 
s_cmp_lg_u32 s75, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 -; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 +; GLOBALNESS0-NEXT: ; %bb.8: ; %LeafBlock +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s55, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], 0 +; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3] ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[86:87], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[76:77], s[62:63] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[52:53], s[86:87] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] +; GLOBALNESS0-NEXT: v_writelane_b32 v56, s8, 10 +; GLOBALNESS0-NEXT: v_writelane_b32 v56, s9, 11 +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v56, 2 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v56, 3 +; GLOBALNESS0-NEXT: s_mov_b32 s83, s55 +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -477,70 +525,72 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; 
GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[96:97], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[98:99], 1, v0 ; GLOBALNESS0-NEXT: s_branch .LBB1_16 ; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[68:69] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[64:65] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[44:45] +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v56, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v56, 1 +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: 
s_and_b64 vcc, exec, s[66:67] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[98:99] ; GLOBALNESS0-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[80:81] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_add_u32 s72, s38, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s73, s39, 0 +; GLOBALNESS0-NEXT: s_add_u32 s84, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s85, s39, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS0-NEXT: s_load_dwordx2 s[78:79], s[4:5], 0x0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[84:85] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[84:85] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 
s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -548,20 +598,27 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_14 ; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GLOBALNESS0-NEXT: s_branch .LBB1_3 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: v_readlane_b32 s84, v56, 8 +; GLOBALNESS0-NEXT: v_readlane_b32 s8, v56, 10 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 +; GLOBALNESS0-NEXT: s_mov_b32 s55, s83 +; GLOBALNESS0-NEXT: v_readlane_b32 s85, v56, 9 +; GLOBALNESS0-NEXT: v_readlane_b32 s9, v56, 11 ; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[76:77] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] +; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[86:87] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v56, 4 +; 
GLOBALNESS0-NEXT: v_readlane_b32 s7, v56, 5 +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -576,18 +633,18 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_32 ; GLOBALNESS0-NEXT: ; %bb.31: ; %bb7.i.i ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 @@ -600,12 +657,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; %bb.33: ; %bb11.i.i ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 +; 
GLOBALNESS0-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll index d9e0e0298e072..ef2eca8cd3491 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll @@ -13,11 +13,11 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:sreg_64 = COPY killed [[COPY1]] ; CHECK-NEXT: early-clobber %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[COPY2]], 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %11.sub0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY %11.sub0 ; CHECK-NEXT: $sgpr0 = COPY killed [[COPY3]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %11.sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0 = COPY killed %11.sub1 ; CHECK-NEXT: $sgpr1 = COPY killed [[COPY4]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0 = COPY killed [[S_LOAD_DWORD_IMM]] ; CHECK-NEXT: $sgpr2 = COPY killed [[COPY5]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit killed $sgpr0, implicit killed $sgpr1, implicit killed $sgpr2 %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir index b80c478c3761f..0df2e651a15e1 100644 --- a/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir @@ -27,11 +27,11 @@ body: | ; CHECK-NEXT: renamable $sgpr4 = COPY 
$sgpr0 ; CHECK-NEXT: SI_SPILL_S128_SAVE $sgpr0_sgpr1_sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr5 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr76 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr77 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr78 = COPY renamable $sgpr5 + ; CHECK-NEXT: renamable $sgpr36 = COPY renamable $sgpr5 + ; CHECK-NEXT: renamable $sgpr37 = COPY renamable $sgpr5 + ; CHECK-NEXT: renamable $sgpr38 = COPY renamable $sgpr5 ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1056964608 - ; CHECK-NEXT: renamable $sgpr79 = COPY renamable $sgpr5 + ; CHECK-NEXT: renamable $sgpr39 = COPY renamable $sgpr5 ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0 ; CHECK-NEXT: renamable $sgpr8 = COPY renamable $sgpr5 ; CHECK-NEXT: renamable $sgpr9 = COPY renamable $sgpr5 @@ -43,46 +43,46 @@ body: | ; CHECK-NEXT: renamable $sgpr15 = COPY renamable $sgpr5 ; CHECK-NEXT: renamable $vgpr5_vgpr6 = COPY killed renamable $sgpr0_sgpr1 ; CHECK-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1088, 0 :: (dereferenceable load (s256), addrspace 6) - ; CHECK-NEXT: renamable $sgpr80_sgpr81_sgpr82_sgpr83 = S_LOAD_DWORDX4_IMM renamable $sgpr4_sgpr5, 0, 0 :: (load (s128), addrspace 6) + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67 = S_LOAD_DWORDX4_IMM renamable $sgpr4_sgpr5, 0, 0 :: (load (s128), addrspace 6) ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1200 ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1152, 0 :: (dereferenceable load (s256), addrspace 6) - ; CHECK-NEXT: renamable $sgpr84_sgpr85_sgpr86_sgpr87 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) + ; CHECK-NEXT: renamable 
$sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1152, 0 :: (dereferenceable load (s256), addrspace 6) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) ; CHECK-NEXT: KILL killed renamable $sgpr0, renamable $sgpr1 ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1264 ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1216, 0 :: (dereferenceable load (s256), addrspace 6) - ; CHECK-NEXT: renamable $sgpr88_sgpr89_sgpr90_sgpr91 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) + ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1216, 0 :: (dereferenceable load (s256), addrspace 6) + ; CHECK-NEXT: renamable $sgpr80_sgpr81_sgpr82_sgpr83 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1328 ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1280, 0 :: (dereferenceable load (s256), addrspace 6) - ; CHECK-NEXT: renamable $sgpr92_sgpr93_sgpr94_sgpr95 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) - ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1344, 0 :: (dereferenceable load (s256), addrspace 6) + ; CHECK-NEXT: renamable $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1280, 0 :: (dereferenceable load (s256), addrspace 6) + ; CHECK-NEXT: renamable $sgpr84_sgpr85_sgpr86_sgpr87 = S_LOAD_DWORDX4_IMM killed renamable 
$sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) + ; CHECK-NEXT: renamable $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1344, 0 :: (dereferenceable load (s256), addrspace 6) ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 1392 ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 0, 0 :: (load (s256), addrspace 6) + ; CHECK-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 0, 0 :: (load (s256), addrspace 6) ; CHECK-NEXT: renamable $sgpr2 = S_MOV_B32 1456 ; CHECK-NEXT: renamable $sgpr3 = COPY renamable $sgpr5 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1472, 0 :: (dereferenceable load (s256), addrspace 6) + ; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1472, 0 :: (dereferenceable load (s256), addrspace 6) ; CHECK-NEXT: renamable $sgpr4 = S_MOV_B32 1520 ; CHECK-NEXT: renamable $sgpr96_sgpr97_sgpr98_sgpr99 = S_LOAD_DWORDX4_IMM killed renamable $sgpr2_sgpr3, 0, 0 :: (load (s128), addrspace 6) ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (load (s128), addrspace 6) ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s128), addrspace 6) - ; CHECK-NEXT: renamable $vgpr7 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, killed renamable $sgpr76_sgpr77_sgpr78_sgpr79, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr8 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable 
$sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, killed renamable $sgpr80_sgpr81_sgpr82_sgpr83, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr9 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr11 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, killed renamable $sgpr92_sgpr93_sgpr94_sgpr95, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr12 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr13 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr7 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable 
$sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, killed renamable $sgpr36_sgpr37_sgpr38_sgpr39, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr8 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, killed renamable $sgpr64_sgpr65_sgpr66_sgpr67, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr9 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47, killed renamable $sgpr68_sgpr69_sgpr70_sgpr71, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, killed renamable $sgpr80_sgpr81_sgpr82_sgpr83, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr11 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr12 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr13 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) + ; CHECK-NEXT: renamable $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable 
$sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) ; CHECK-NEXT: renamable $sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr1_vgpr2_vgpr3_vgpr4 = BUFFER_LOAD_FORMAT_XYZW_IDXEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) ; CHECK-NEXT: KILL killed renamable $sgpr4_sgpr5_sgpr6_sgpr7 - ; CHECK-NEXT: KILL killed renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 + ; CHECK-NEXT: KILL killed renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 ; CHECK-NEXT: KILL killed renamable $vgpr5_vgpr6 ; CHECK-NEXT: KILL killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK-NEXT: KILL killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 + ; CHECK-NEXT: KILL killed renamable $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; CHECK-NEXT: KILL killed renamable $sgpr8_sgpr9_sgpr10_sgpr11 ; CHECK-NEXT: KILL killed renamable $vgpr0 ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_MAX_F32_e32 killed $vgpr7, killed $vgpr8, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll new file mode 100644 index 0000000000000..b2df807bcb08c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll @@ -0,0 +1,195 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s + +declare ptr @G() + +define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x i32> %vec) { +; CHECK-LABEL: foo: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, 
s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: v_pk_mov_b32 v[44:45], 0, 0 +; CHECK-NEXT: flat_load_dword a32, v[44:45] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8 +; CHECK-NEXT: s_load_dword s64, s[8:9], 0x0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v46, s6 +; CHECK-NEXT: v_mov_b32_e32 v47, s7 +; CHECK-NEXT: s_mov_b64 s[6:7], src_private_base +; CHECK-NEXT: s_cmp_lg_u32 s64, -1 +; CHECK-NEXT: s_cselect_b32 s7, s7, 0 +; CHECK-NEXT: s_cselect_b32 s8, s64, 0 +; CHECK-NEXT: s_add_u32 s50, s34, 48 +; CHECK-NEXT: s_addc_u32 s51, s35, 0 +; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[4:5], s[4:5] op_sel:[0,1] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: v_mov_b32_e32 v57, s7 +; CHECK-NEXT: s_mov_b32 s7, s6 +; CHECK-NEXT: s_mov_b32 s53, s14 +; CHECK-NEXT: v_mov_b32_e32 v56, s8 +; CHECK-NEXT: v_pk_mov_b32 v[60:61], s[6:7], s[6:7] op_sel:[0,1] +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51] +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s14, s16 +; CHECK-NEXT: v_mov_b32_e32 v31, v0 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_mov_b32 s33, s16 +; CHECK-NEXT: s_mov_b32 s52, s15 +; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] +; CHECK-NEXT: v_mov_b32_e32 v40, v0 +; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61] +; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] +; CHECK-NEXT: flat_load_dwordx2 v[62:63], v[58:59] +; CHECK-NEXT: v_mov_b32_e32 v42, 0 
+; CHECK-NEXT: v_mov_b32_e32 v43, 0x3ff00000 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 +; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[42:43] +; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] +; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[56:57] glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_accvgpr_read_b32 v6, a32 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s64 +; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 +; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[46:47] +; CHECK-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen +; CHECK-NEXT: ; implicit-def: $vgpr4 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; CHECK-NEXT: s_cbranch_execz .LBB0_4 +; CHECK-NEXT: ; %bb.1: ; %LeafBlock5 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: ; %bb.2: ; %sw.bb17.i.i.i.i +; CHECK-NEXT: v_mov_b32_e32 v4, 1 +; CHECK-NEXT: ; %bb.3: ; %Flow +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB0_4: ; %Flow8 +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], s[4:5] +; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; CHECK-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] +; CHECK-NEXT: s_xor_b64 exec, exec, s[8:9] +; CHECK-NEXT: s_cbranch_execz .LBB0_8 +; CHECK-NEXT: ; %bb.5: ; %LeafBlock +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; 
CHECK-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; CHECK-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] +; CHECK-NEXT: s_and_saveexec_b64 s[10:11], vcc +; CHECK-NEXT: ; %bb.6: ; %sw.bb.i.i.i.i +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: s_mov_b32 s6, s4 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; CHECK-NEXT: ; %bb.7: ; %Flow7 +; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: .LBB0_8: ; %bb.1 +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB0_10 +; CHECK-NEXT: ; %bb.9: ; %sw.bb.i.i.i.i.i +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[34:35], 0x20 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1] +; CHECK-NEXT: .LBB0_10: ; %bb.2 +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm +entry: + %load.null = load i32, ptr null, align 8 + %insert = insertelement <4 x i32> zeroinitializer, i32 %load.null, i64 0 + %cast = addrspacecast ptr addrspace(5) %ptr5 to ptr + store double 0.000000e+00, ptr %p0, align 8 + %call = tail call ptr @G() + store double 1.000000e+00, ptr null, align 8 + %load.0 = load double, ptr %p0, align 8 + store volatile double 0.000000e+00, ptr %p0, align 8 + %call.1 = tail call ptr @G() + %load.1 = load volatile double, ptr %cast, align 8 + store volatile double %load.0, ptr %p0, align 8 + store double %v0, ptr %p0, align 8 + %load.2 = load double, ptr %p0, align 8 + store double %load.2, ptr addrspace(5) %ptr5, align 8 + store i32 0, ptr 
addrspace(5) %ptr5, align 4 + switch i32 %load.null, label %bb.1 [ + i32 0, label %sw.bb.i.i.i.i + i32 1, label %sw.bb17.i.i.i.i + ] + +sw.bb.i.i.i.i: ; preds = %entry + br label %bb.1 + +sw.bb17.i.i.i.i: ; preds = %entry + br label %bb.1 + +bb.1: ; preds = %sw.bb17.i.i.i.i, %sw.bb.i.i.i.i, %entry + %phi.0 = phi i32 [ 0, %entry ], [ 0, %sw.bb.i.i.i.i ], [ 1, %sw.bb17.i.i.i.i ] + %phi.1 = phi <4 x i32> [ %insert, %entry ], [ zeroinitializer, %sw.bb.i.i.i.i ], [ %insert, %sw.bb17.i.i.i.i ] + switch i32 %phi.0, label %bb.2 [ + i32 0, label %sw.bb.i.i.i.i.i + ] + +sw.bb.i.i.i.i.i: ; preds = %bb.1 + br label %bb.2 + +bb.2: ; preds = %sw.bb.i.i.i.i.i, %bb.1 + %phi.2 = phi <4 x i32> [ %phi.1, %bb.1 ], [ %vec, %sw.bb.i.i.i.i.i ] + %extract.1 = extractelement <4 x i32> %phi.2, i64 0 + switch i32 1, label %bb.3 [ + i32 0, label %sw.bb.i.i5.i.i + ] + +sw.bb.i.i5.i.i: ; preds = %bb.2 + br label %bb.3 + +bb.3: ; preds = %sw.bb.i.i5.i.i, %bb.2 + %phi.3 = phi <4 x i32> [ zeroinitializer, %sw.bb.i.i5.i.i ], [ %insert, %bb.2 ] + switch i32 %extract.1, label %bb.4 [ + i32 0, label %sw.bb7.i.i.i3.i.i + ] + +sw.bb7.i.i.i3.i.i: ; preds = %bb.3 + %insert.0 = insertelement <4 x i32> %insert, i32 0, i64 1 + br label %bb.4 + +bb.4: ; preds = %sw.bb7.i.i.i3.i.i, %bb.3 + %phi.4 = phi <4 x i32> [ %phi.3, %bb.3 ], [ %insert.0, %sw.bb7.i.i.i3.i.i ] + %extract = extractelement <4 x i32> %phi.4, i64 0 + store i32 %extract, ptr addrspace(5) null, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll index da0234c90363d..06535c5e0c5f2 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll @@ -87,7 +87,7 @@ define internal void @internal2() { define amdgpu_kernel void @kernel2() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel2 -; CHECK-SAME: () #[[ATTR4:[0-9]+]] { +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: call void @internal2() 
; CHECK-NEXT: ret void ; @@ -101,5 +101,4 @@ attributes #0 = { "uniform-work-group-size"="true" } ; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR2]] = { "uniform-work-group-size"="true" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll index dc19f4d879e86..f292c7cf318ca 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll @@ -103,5 +103,5 @@ attributes #1 = { "uniform-work-group-size"="true" } ;. ; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index 433fcd3c324df..04486d0c91563 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -20,14 +20,14 @@ define hidden void @widget() { ; GCN-NEXT: v_writelane_b32 v41, s37, 3 ; GCN-NEXT: v_writelane_b32 v41, s38, 4 ; GCN-NEXT: v_writelane_b32 v41, s39, 5 -; GCN-NEXT: v_writelane_b32 v41, s40, 6 -; GCN-NEXT: v_writelane_b32 v41, s41, 7 -; GCN-NEXT: v_writelane_b32 v41, s42, 8 -; GCN-NEXT: v_writelane_b32 v41, s43, 9 -; GCN-NEXT: v_writelane_b32 v41, s44, 10 -; GCN-NEXT: v_writelane_b32 v41, s45, 11 -; GCN-NEXT: v_writelane_b32 v41, s46, 12 -; GCN-NEXT: v_writelane_b32 v41, s47, 13 +; GCN-NEXT: v_writelane_b32 v41, s48, 6 +; GCN-NEXT: v_writelane_b32 v41, s49, 7 +; GCN-NEXT: v_writelane_b32 v41, s50, 8 +; GCN-NEXT: v_writelane_b32 v41, s51, 9 +; GCN-NEXT: v_writelane_b32 v41, s52, 10 +; GCN-NEXT: v_writelane_b32 v41, s53, 11 +; GCN-NEXT: v_writelane_b32 v41, s54, 12 +; GCN-NEXT: v_writelane_b32 v41, s55, 13 ; GCN-NEXT: v_writelane_b32 v41, s30, 14 ; GCN-NEXT: v_writelane_b32 v41, s31, 15 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -37,7 +37,7 @@ define hidden void @widget() { ; GCN-NEXT: s_mov_b64 s[16:17], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; 
GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0 -; GCN-NEXT: s_mov_b64 s[46:47], 0 +; GCN-NEXT: s_mov_b64 s[54:55], 0 ; GCN-NEXT: s_mov_b64 s[18:19], 0 ; GCN-NEXT: s_cbranch_vccz .LBB0_9 ; GCN-NEXT: ; %bb.1: ; %Flow @@ -53,29 +53,29 @@ define hidden void @widget() { ; GCN-NEXT: s_mov_b64 s[34:35], s[4:5] ; GCN-NEXT: s_mov_b64 s[36:37], s[6:7] ; GCN-NEXT: s_mov_b64 s[38:39], s[8:9] -; GCN-NEXT: s_mov_b64 s[40:41], s[10:11] -; GCN-NEXT: s_mov_b32 s42, s12 -; GCN-NEXT: s_mov_b32 s43, s13 -; GCN-NEXT: s_mov_b32 s44, s14 -; GCN-NEXT: s_mov_b32 s45, s15 +; GCN-NEXT: s_mov_b64 s[48:49], s[10:11] +; GCN-NEXT: s_mov_b32 s50, s12 +; GCN-NEXT: s_mov_b32 s51, s13 +; GCN-NEXT: s_mov_b32 s52, s14 +; GCN-NEXT: s_mov_b32 s53, s15 ; GCN-NEXT: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_mov_b32_e32 v31, v40 -; GCN-NEXT: s_mov_b32 s12, s42 -; GCN-NEXT: s_mov_b32 s13, s43 -; GCN-NEXT: s_mov_b32 s14, s44 -; GCN-NEXT: s_mov_b32 s15, s45 +; GCN-NEXT: s_mov_b32 s12, s50 +; GCN-NEXT: s_mov_b32 s13, s51 +; GCN-NEXT: s_mov_b32 s14, s52 +; GCN-NEXT: s_mov_b32 s15, s53 ; GCN-NEXT: s_mov_b64 s[4:5], s[34:35] ; GCN-NEXT: s_mov_b64 s[6:7], s[36:37] ; GCN-NEXT: s_mov_b64 s[8:9], s[38:39] -; GCN-NEXT: s_mov_b64 s[10:11], s[40:41] +; GCN-NEXT: s_mov_b64 s[10:11], s[48:49] ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; GCN-NEXT: s_mov_b64 s[16:17], 0 -; GCN-NEXT: s_andn2_b64 s[18:19], s[46:47], exec +; GCN-NEXT: s_andn2_b64 s[18:19], s[54:55], exec ; GCN-NEXT: s_and_b64 s[20:21], vcc, exec -; GCN-NEXT: s_or_b64 s[46:47], s[18:19], s[20:21] +; GCN-NEXT: s_or_b64 s[54:55], s[18:19], s[20:21] ; GCN-NEXT: .LBB0_4: ; %Flow2 -; GCN-NEXT: s_and_saveexec_b64 s[18:19], s[46:47] +; GCN-NEXT: s_and_saveexec_b64 s[18:19], s[54:55] ; GCN-NEXT: s_xor_b64 s[18:19], exec, s[18:19] ; GCN-NEXT: s_cbranch_execz .LBB0_6 ; GCN-NEXT: ; %bb.5: ; %bb12 @@ -95,14 +95,14 @@ define hidden void @widget() { ; GCN-NEXT: .LBB0_8: ; %UnifiedReturnBlock ; GCN-NEXT: v_readlane_b32 s30, v41, 14 ; GCN-NEXT: 
v_readlane_b32 s31, v41, 15 -; GCN-NEXT: v_readlane_b32 s47, v41, 13 -; GCN-NEXT: v_readlane_b32 s46, v41, 12 -; GCN-NEXT: v_readlane_b32 s45, v41, 11 -; GCN-NEXT: v_readlane_b32 s44, v41, 10 -; GCN-NEXT: v_readlane_b32 s43, v41, 9 -; GCN-NEXT: v_readlane_b32 s42, v41, 8 -; GCN-NEXT: v_readlane_b32 s41, v41, 7 -; GCN-NEXT: v_readlane_b32 s40, v41, 6 +; GCN-NEXT: v_readlane_b32 s55, v41, 13 +; GCN-NEXT: v_readlane_b32 s54, v41, 12 +; GCN-NEXT: v_readlane_b32 s53, v41, 11 +; GCN-NEXT: v_readlane_b32 s52, v41, 10 +; GCN-NEXT: v_readlane_b32 s51, v41, 9 +; GCN-NEXT: v_readlane_b32 s50, v41, 8 +; GCN-NEXT: v_readlane_b32 s49, v41, 7 +; GCN-NEXT: v_readlane_b32 s48, v41, 6 ; GCN-NEXT: v_readlane_b32 s39, v41, 5 ; GCN-NEXT: v_readlane_b32 s38, v41, 4 ; GCN-NEXT: v_readlane_b32 s37, v41, 3 @@ -119,7 +119,7 @@ define hidden void @widget() { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; GCN-NEXT: .LBB0_9: ; %bb2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[46:47], 21, v0 +; GCN-NEXT: v_cmp_eq_u32_e64 s[54:55], 21, v0 ; GCN-NEXT: v_cmp_ne_u32_e64 s[18:19], 21, v0 ; GCN-NEXT: s_mov_b64 vcc, exec ; GCN-NEXT: s_cbranch_execnz .LBB0_2 @@ -272,53 +272,53 @@ define hidden void @blam() { ; GCN-NEXT: v_writelane_b32 v45, s37, 3 ; GCN-NEXT: v_writelane_b32 v45, s38, 4 ; GCN-NEXT: v_writelane_b32 v45, s39, 5 -; GCN-NEXT: v_writelane_b32 v45, s40, 6 -; GCN-NEXT: v_writelane_b32 v45, s41, 7 -; GCN-NEXT: v_writelane_b32 v45, s42, 8 -; GCN-NEXT: v_writelane_b32 v45, s43, 9 -; GCN-NEXT: v_writelane_b32 v45, s44, 10 -; GCN-NEXT: v_writelane_b32 v45, s45, 11 -; GCN-NEXT: v_writelane_b32 v45, s46, 12 -; GCN-NEXT: v_writelane_b32 v45, s47, 13 -; GCN-NEXT: v_writelane_b32 v45, s48, 14 -; GCN-NEXT: v_writelane_b32 v45, s49, 15 -; GCN-NEXT: v_writelane_b32 v45, s50, 16 -; GCN-NEXT: v_writelane_b32 v45, s51, 17 -; GCN-NEXT: v_writelane_b32 v45, s52, 18 -; GCN-NEXT: v_writelane_b32 v45, s53, 19 -; GCN-NEXT: v_writelane_b32 v45, s54, 20 -; GCN-NEXT: v_writelane_b32 v45, s55, 21 -; GCN-NEXT: 
v_writelane_b32 v45, s56, 22 -; GCN-NEXT: v_writelane_b32 v45, s57, 23 +; GCN-NEXT: v_writelane_b32 v45, s48, 6 +; GCN-NEXT: v_writelane_b32 v45, s49, 7 +; GCN-NEXT: v_writelane_b32 v45, s50, 8 +; GCN-NEXT: v_writelane_b32 v45, s51, 9 +; GCN-NEXT: v_writelane_b32 v45, s52, 10 +; GCN-NEXT: v_writelane_b32 v45, s53, 11 +; GCN-NEXT: v_writelane_b32 v45, s54, 12 +; GCN-NEXT: v_writelane_b32 v45, s55, 13 +; GCN-NEXT: v_writelane_b32 v45, s64, 14 +; GCN-NEXT: v_writelane_b32 v45, s65, 15 +; GCN-NEXT: v_writelane_b32 v45, s66, 16 +; GCN-NEXT: v_writelane_b32 v45, s67, 17 +; GCN-NEXT: v_writelane_b32 v45, s68, 18 +; GCN-NEXT: v_writelane_b32 v45, s69, 19 +; GCN-NEXT: v_writelane_b32 v45, s70, 20 +; GCN-NEXT: v_writelane_b32 v45, s71, 21 +; GCN-NEXT: v_writelane_b32 v45, s80, 22 +; GCN-NEXT: v_writelane_b32 v45, s81, 23 ; GCN-NEXT: v_writelane_b32 v45, s30, 24 ; GCN-NEXT: v_writelane_b32 v45, s31, 25 ; GCN-NEXT: v_mov_b32_e32 v40, v31 -; GCN-NEXT: s_mov_b32 s46, s15 -; GCN-NEXT: s_mov_b32 s47, s14 -; GCN-NEXT: s_mov_b32 s48, s13 -; GCN-NEXT: s_mov_b32 s49, s12 +; GCN-NEXT: s_mov_b32 s54, s15 +; GCN-NEXT: s_mov_b32 s55, s14 +; GCN-NEXT: s_mov_b32 s64, s13 +; GCN-NEXT: s_mov_b32 s65, s12 ; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] -; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] +; GCN-NEXT: s_mov_b64 s[48:49], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40 ; GCN-NEXT: flat_load_dword v43, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v42, 0 -; GCN-NEXT: s_mov_b64 s[50:51], 0 +; GCN-NEXT: s_mov_b64 s[66:67], 0 ; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[52:53], 0, v43 -; GCN-NEXT: v_cmp_neq_f32_e64 s[42:43], 0, v43 +; GCN-NEXT: v_cmp_eq_f32_e64 s[68:69], 0, v43 +; GCN-NEXT: v_cmp_neq_f32_e64 s[50:51], 0, v43 ; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000 ; GCN-NEXT: s_branch .LBB1_2 ; GCN-NEXT: 
.LBB1_1: ; %Flow7 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_and_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_or_b64 s[50:51], s[4:5], s[50:51] -; GCN-NEXT: s_andn2_b64 exec, exec, s[50:51] +; GCN-NEXT: s_or_b64 s[66:67], s[4:5], s[66:67] +; GCN-NEXT: s_andn2_b64 exec, exec, s[66:67] ; GCN-NEXT: s_cbranch_execz .LBB1_18 ; GCN-NEXT: .LBB1_2: ; %bb2 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -329,26 +329,26 @@ define hidden void @blam() { ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 2, v0 ; GCN-NEXT: s_mov_b64 s[4:5], -1 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_xor_b64 s[54:55], exec, s[8:9] +; GCN-NEXT: s_xor_b64 s[70:71], exec, s[8:9] ; GCN-NEXT: s_cbranch_execz .LBB1_12 ; GCN-NEXT: ; %bb.3: ; %bb6 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: v_cmp_eq_u32_e64 s[44:45], 3, v0 -; GCN-NEXT: s_and_saveexec_b64 s[56:57], s[44:45] +; GCN-NEXT: v_cmp_eq_u32_e64 s[52:53], 3, v0 +; GCN-NEXT: s_and_saveexec_b64 s[80:81], s[52:53] ; GCN-NEXT: s_cbranch_execz .LBB1_11 ; GCN-NEXT: ; %bb.4: ; %bb11 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, spam@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, spam@rel32@hi+12 -; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_mov_b64 s[4:5], s[48:49] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] ; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] -; GCN-NEXT: s_mov_b32 s12, s49 -; GCN-NEXT: s_mov_b32 s13, s48 -; GCN-NEXT: s_mov_b32 s14, s47 -; GCN-NEXT: s_mov_b32 s15, s46 +; GCN-NEXT: s_mov_b32 s12, s65 +; GCN-NEXT: s_mov_b32 s13, s64 +; GCN-NEXT: s_mov_b32 s14, s55 +; GCN-NEXT: s_mov_b32 s15, s54 ; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 @@ -357,13 +357,13 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_10 ; GCN-NEXT: ; %bb.5: ; %bb14 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: 
s_mov_b64 s[8:9], s[52:53] -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[42:43] +; GCN-NEXT: s_mov_b64 s[8:9], s[68:69] +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[50:51] ; GCN-NEXT: s_cbranch_execz .LBB1_7 ; GCN-NEXT: ; %bb.6: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 -; GCN-NEXT: s_or_b64 s[8:9], s[52:53], exec +; GCN-NEXT: s_or_b64 s[8:9], s[68:69], exec ; GCN-NEXT: .LBB1_7: ; %Flow3 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] @@ -382,19 +382,19 @@ define hidden void @blam() { ; GCN-NEXT: .LBB1_10: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_andn2_b64 s[4:5], s[44:45], exec +; GCN-NEXT: s_andn2_b64 s[4:5], s[52:53], exec ; GCN-NEXT: s_and_b64 s[8:9], vcc, exec -; GCN-NEXT: s_or_b64 s[44:45], s[4:5], s[8:9] +; GCN-NEXT: s_or_b64 s[52:53], s[4:5], s[8:9] ; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GCN-NEXT: .LBB1_11: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[56:57] -; GCN-NEXT: s_orn2_b64 s[4:5], s[44:45], exec +; GCN-NEXT: s_or_b64 exec, exec, s[80:81] +; GCN-NEXT: s_orn2_b64 s[4:5], s[52:53], exec ; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB1_12: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[54:55] +; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[70:71] ; GCN-NEXT: s_cbranch_execz .LBB1_16 ; GCN-NEXT: ; %bb.13: ; %bb8 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -426,27 +426,27 @@ define hidden void @blam() { ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_branch .LBB1_1 ; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock -; GCN-NEXT: s_or_b64 exec, exec, s[50:51] +; GCN-NEXT: s_or_b64 exec, exec, s[66:67] ; GCN-NEXT: v_readlane_b32 s30, v45, 24 ; GCN-NEXT: v_readlane_b32 s31, v45, 25 -; GCN-NEXT: v_readlane_b32 s57, v45, 23 -; GCN-NEXT: v_readlane_b32 s56, v45, 
22 -; GCN-NEXT: v_readlane_b32 s55, v45, 21 -; GCN-NEXT: v_readlane_b32 s54, v45, 20 -; GCN-NEXT: v_readlane_b32 s53, v45, 19 -; GCN-NEXT: v_readlane_b32 s52, v45, 18 -; GCN-NEXT: v_readlane_b32 s51, v45, 17 -; GCN-NEXT: v_readlane_b32 s50, v45, 16 -; GCN-NEXT: v_readlane_b32 s49, v45, 15 -; GCN-NEXT: v_readlane_b32 s48, v45, 14 -; GCN-NEXT: v_readlane_b32 s47, v45, 13 -; GCN-NEXT: v_readlane_b32 s46, v45, 12 -; GCN-NEXT: v_readlane_b32 s45, v45, 11 -; GCN-NEXT: v_readlane_b32 s44, v45, 10 -; GCN-NEXT: v_readlane_b32 s43, v45, 9 -; GCN-NEXT: v_readlane_b32 s42, v45, 8 -; GCN-NEXT: v_readlane_b32 s41, v45, 7 -; GCN-NEXT: v_readlane_b32 s40, v45, 6 +; GCN-NEXT: v_readlane_b32 s81, v45, 23 +; GCN-NEXT: v_readlane_b32 s80, v45, 22 +; GCN-NEXT: v_readlane_b32 s71, v45, 21 +; GCN-NEXT: v_readlane_b32 s70, v45, 20 +; GCN-NEXT: v_readlane_b32 s69, v45, 19 +; GCN-NEXT: v_readlane_b32 s68, v45, 18 +; GCN-NEXT: v_readlane_b32 s67, v45, 17 +; GCN-NEXT: v_readlane_b32 s66, v45, 16 +; GCN-NEXT: v_readlane_b32 s65, v45, 15 +; GCN-NEXT: v_readlane_b32 s64, v45, 14 +; GCN-NEXT: v_readlane_b32 s55, v45, 13 +; GCN-NEXT: v_readlane_b32 s54, v45, 12 +; GCN-NEXT: v_readlane_b32 s53, v45, 11 +; GCN-NEXT: v_readlane_b32 s52, v45, 10 +; GCN-NEXT: v_readlane_b32 s51, v45, 9 +; GCN-NEXT: v_readlane_b32 s50, v45, 8 +; GCN-NEXT: v_readlane_b32 s49, v45, 7 +; GCN-NEXT: v_readlane_b32 s48, v45, 6 ; GCN-NEXT: v_readlane_b32 s39, v45, 5 ; GCN-NEXT: v_readlane_b32 s38, v45, 4 ; GCN-NEXT: v_readlane_b32 s37, v45, 3 diff --git a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir index ae90948615c6d..dd7d96f9d6e3c 100644 --- a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir +++ b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir @@ -37,7 +37,7 @@ body: | ; MUBUF-LABEL: name: use_restore_frame_reg ; MUBUF: bb.0: ; MUBUF-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; MUBUF-NEXT: liveins: $vgpr1, $vgpr2 + ; MUBUF-NEXT: liveins: $sgpr40, 
$sgpr41, $vgpr1 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 @@ -68,18 +68,12 @@ body: | ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 - ; MUBUF-NEXT: $sgpr4 = COPY $sgpr33 + ; MUBUF-NEXT: $sgpr40 = frame-setup COPY $sgpr33 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION register $sgpr33, $sgpr40 ; MUBUF-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; MUBUF-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc - ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 9961728, implicit-def dead $scc - ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5) - ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr2, 9961728 - ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 - ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr33, $vgpr2, 0, 32 - ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 - ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr34, $vgpr2, 1, 32 + ; MUBUF-NEXT: $sgpr41 = frame-setup COPY $sgpr34 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION register $sgpr34, $sgpr41 ; MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $sgpr33 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 11010048, implicit-def dead $scc @@ -87,36 +81,31 @@ body: | ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def 
$sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; MUBUF-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec - ; MUBUF-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 155648, killed $vgpr3, implicit $exec - ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; MUBUF-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; MUBUF-NEXT: $vgpr2 = V_ADD_U32_e32 155648, killed $vgpr2, implicit $exec + ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, 
implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; MUBUF-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc ; MUBUF-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.1: ; MUBUF-NEXT: successors: %bb.2(0x80000000) - ; MUBUF-NEXT: liveins: $vgpr2 + ; MUBUF-NEXT: liveins: $sgpr40, $sgpr41 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_NOP 0 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: - ; MUBUF-NEXT: liveins: $vgpr2 + ; MUBUF-NEXT: liveins: $sgpr40, $sgpr41 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; MUBUF-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; MUBUF-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 9961728, implicit-def dead $scc - ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5) - ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; MUBUF-NEXT: $sgpr34 = frame-destroy COPY $sgpr41 ; MUBUF-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_register $sgpr32 - ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4 + ; MUBUF-NEXT: $sgpr33 = frame-destroy COPY $sgpr40 ; MUBUF-NEXT: S_ENDPGM 0 ; ; FLATSCR-LABEL: name: use_restore_frame_reg ; FLATSCR: bb.0: ; FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; FLATSCR-NEXT: liveins: $vgpr1, $vgpr2 + ; FLATSCR-NEXT: liveins: $sgpr40, $sgpr41, $vgpr1 ; FLATSCR-NEXT: {{ $}} ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02 ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 @@ -147,58 +136,41 @@ body: | ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr28 ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 - ; FLATSCR-NEXT: $sgpr4 = COPY $sgpr33 + ; FLATSCR-NEXT: $sgpr40 = frame-setup COPY $sgpr33 + ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION register $sgpr33, $sgpr40 ; FLATSCR-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; FLATSCR-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc - ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 155652, implicit-def dead $scc - ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.20, addrspace 5) - ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr2, 9961728 - ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 - ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr33, $vgpr2, 0, 32 - ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 - ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION llvm_vector_registers $sgpr34, $vgpr2, 1, 32 + ; FLATSCR-NEXT: $sgpr41 = frame-setup COPY $sgpr34 + ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION register $sgpr34, $sgpr41 ; FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCR-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x41, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02 ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 172032, implicit-def dead $scc ; FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, 
implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; FLATSCR-NEXT: $sgpr33 = S_ADDC_U32 $sgpr33, 8192, implicit-def $scc, implicit $scc - ; FLATSCR-NEXT: S_BITCMP1_B32 $sgpr33, 0, implicit-def $scc - ; FLATSCR-NEXT: $sgpr33 = S_BITSET0_B32 0, $sgpr33 - ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; FLATSCR-NEXT: $sgpr33 = S_ADDC_U32 $sgpr33, -8192, implicit-def $scc, implicit $scc - ; FLATSCR-NEXT: S_BITCMP1_B32 $sgpr33, 0, implicit-def $scc - ; FLATSCR-NEXT: $sgpr33 = S_BITSET0_B32 0, $sgpr33 - ; FLATSCR-NEXT: $sgpr33 = S_ADDC_U32 $sgpr33, 155648, implicit-def $scc, implicit $scc - ; FLATSCR-NEXT: S_BITCMP1_B32 $sgpr33, 0, implicit-def $scc - ; FLATSCR-NEXT: $sgpr33 = S_BITSET0_B32 0, $sgpr33 - ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 - ; FLATSCR-NEXT: $sgpr33 = S_ADDC_U32 $sgpr33, -155648, implicit-def $scc, implicit $scc - ; FLATSCR-NEXT: S_BITCMP1_B32 $sgpr33, 0, implicit-def $scc - ; FLATSCR-NEXT: $sgpr33 = S_BITSET0_B32 0, $sgpr33 + ; FLATSCR-NEXT: $sgpr42 = S_ADDC_U32 $sgpr33, 8192, implicit-def $scc, implicit $scc + ; FLATSCR-NEXT: S_BITCMP1_B32 $sgpr42, 0, implicit-def $scc + ; FLATSCR-NEXT: $sgpr42 = S_BITSET0_B32 0, $sgpr42 + ; FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 
killed $sgpr42, implicit $exec + ; FLATSCR-NEXT: $sgpr42 = S_ADDC_U32 $sgpr33, 155648, implicit-def $scc, implicit $scc + ; FLATSCR-NEXT: S_BITCMP1_B32 $sgpr42, 0, implicit-def $scc + ; FLATSCR-NEXT: $sgpr42 = S_BITSET0_B32 0, $sgpr42 + ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $sgpr42, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; FLATSCR-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc ; FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc ; FLATSCR-NEXT: {{ $}} ; FLATSCR-NEXT: bb.1: ; FLATSCR-NEXT: successors: %bb.2(0x80000000) - ; FLATSCR-NEXT: liveins: $vgpr2 + ; FLATSCR-NEXT: liveins: $sgpr40, $sgpr41 ; FLATSCR-NEXT: {{ $}} ; FLATSCR-NEXT: S_NOP 0 ; FLATSCR-NEXT: {{ $}} ; FLATSCR-NEXT: bb.2: - ; FLATSCR-NEXT: liveins: $vgpr2 + ; FLATSCR-NEXT: liveins: $sgpr40, $sgpr41 ; FLATSCR-NEXT: {{ $}} ; FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 - ; FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 155652, implicit-def dead $scc - ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.20, addrspace 5) - ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 + ; FLATSCR-NEXT: $sgpr34 = frame-destroy COPY $sgpr41 ; FLATSCR-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 
0x04, 0x36, 0x24, 0x36, 0xe9, 0x02 - ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 + ; FLATSCR-NEXT: $sgpr33 = frame-destroy COPY $sgpr40 ; FLATSCR-NEXT: S_ENDPGM 0 bb.0: liveins: $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll index bad0be16e75cc..79ec4b8831679 100644 --- a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll @@ -69,7 +69,6 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: swap: @@ -94,7 +93,6 @@ define half @swap(half %a, half %b, i32 %i) { ; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir index d91ee54215924..1eabe62e7710e 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec,amdgpu-wait-sgpr-hazards -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s --- | @mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> , <4 x 
i32> , <4 x i32> , <4 x i32> ] diff --git a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard-attrs.mir b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard-attrs.mir new file mode 100644 index 0000000000000..fe3cb30793390 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard-attrs.mir @@ -0,0 +1,347 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass post-RA-hazard-rec,amdgpu-wait-sgpr-hazards -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_gs void @hazard_disable() #0 { ret void } + define amdgpu_gs void @hazard_enable() #1 { ret void } + define amdgpu_cs void @hazard_calls() #2 { ret void } + define void @hazard_callee1() #2 { ret void } + define void @hazard_callee2() #2 { ret void } + define amdgpu_cs void @hazard_cull_vmem() #3 { ret void } + define amdgpu_cs void @hazard_cull_vmem2() #4 { ret void } + define amdgpu_cs void @hazard_cull_sample() #3 { ret void } + define amdgpu_cs void @hazard_cull_bvh() #3 { ret void } + define amdgpu_cs void @hazard_nocull_scratch() #3 { ret void } + define amdgpu_cs void @hazard_cull_global() #3 { ret void } + define amdgpu_cs void @hazard_nocull_flat() #3 { ret void } + + attributes #0 = { "amdgpu-sgpr-hazard-wait"="0" } + attributes #1 = { "amdgpu-sgpr-hazard-wait"="1" } + attributes #2 = { "amdgpu-sgpr-hazard-boundary-cull" } + attributes #3 = { "amdgpu-sgpr-hazard-mem-wait-cull" "amdgpu-sgpr-hazard-mem-wait-cull-threshold"="1" } + attributes #4 = { "amdgpu-sgpr-hazard-mem-wait-cull" "amdgpu-sgpr-hazard-mem-wait-cull-threshold"="2" } +... 
+ +--- +name: hazard_disable +body: | + bb.0: + ; GCN-LABEL: name: hazard_disable + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_enable +body: | + bb.0: + ; GCN-LABEL: name: hazard_enable + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_calls +frameInfo: + hasCalls: true +body: | + ; GCN-LABEL: name: hazard_calls + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 0, $sgpr4, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 1, $sgpr8, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 2, $sgpr16, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 3, $sgpr18, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 4, $sgpr20, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 5, $sgpr22, $vgpr0 + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.3, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, 
implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_SETPC_B64 $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: $sgpr18 = S_MOV_B32 0 + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4: + ; GCN-NEXT: successors: %bb.5(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vcc_lo = S_MOV_B32 0 + ; GCN-NEXT: $sgpr20 = S_MOV_B32 0 + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + ; GCN-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5: + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6: + ; GCN-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + $vgpr0 = V_WRITELANE_B32 0, $sgpr4, $vgpr0 + $vgpr0 = V_WRITELANE_B32 1, $sgpr8, $vgpr0 + $vgpr0 = V_WRITELANE_B32 2, $sgpr16, $vgpr0 + $vgpr0 = V_WRITELANE_B32 3, $sgpr18, $vgpr0 + $vgpr0 = V_WRITELANE_B32 4, $sgpr20, $vgpr0 + $vgpr0 = V_WRITELANE_B32 5, $sgpr22, $vgpr0 + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + S_CBRANCH_SCC0 %bb.3, implicit $scc + S_BRANCH %bb.4 + + bb.2: + $sgpr16 = S_MOV_B32 0 + S_SETPC_B64 $sgpr0_sgpr1 + + bb.3: + $sgpr18 = S_MOV_B32 0 + S_SETPC_B64_return $sgpr0_sgpr1 + + bb.4: + $vcc_lo = S_MOV_B32 0 + $sgpr20 = S_MOV_B32 0 + $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + + bb.5: + $sgpr8_sgpr9 = 
S_CALL_B64 0 + + bb.6: + $sgpr22 = S_MOV_B32 $sgpr8 + S_ENDPGM 0 +... + +--- +name: hazard_callee1 +body: | + bb.0: + ; GCN-LABEL: name: hazard_callee1 + ; GCN: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_SETPC_B64_return $sgpr30_sgpr31 +... + +--- +name: hazard_callee2 +body: | + bb.0: + ; GCN-LABEL: name: hazard_callee2 + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_SETPC_B64_return $sgpr30_sgpr31 +... 
+ +--- +name: hazard_cull_vmem +body: | + bb.0: + ; GCN-LABEL: name: hazard_cull_vmem + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_cull_vmem2 +body: | + bb.0: + ; GCN-LABEL: name: hazard_cull_vmem2 + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr2, 0, implicit $exec + ; GCN-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + S_WAIT_LOADCNT 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr2, 
0, implicit $exec + $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_cull_sample +body: | + bb.0: + ; GCN-LABEL: name: hazard_cull_sample + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAIT_SAMPLECNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) + S_WAIT_SAMPLECNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_cull_bvh +body: | + bb.0: + ; GCN-LABEL: name: hazard_cull_bvh + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAIT_BVHCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) + S_WAIT_BVHCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_nocull_scratch +body: | + bb.0: + ; GCN-LABEL: name: hazard_nocull_scratch + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_cull_global +body: | + bb.0: + ; GCN-LABEL: name: hazard_cull_global + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_nocull_flat +body: | + bb.0: + ; GCN-LABEL: name: hazard_nocull_flat + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir index 2aa16dd904766..04f7e480764e6 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O0 %s -# RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O2 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec,amdgpu-wait-sgpr-hazards -amdgpu-sgpr-hazard-boundary-cull=0 -o - %s | FileCheck -check-prefixes=GCN,NOBC,NOMEMC %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec,amdgpu-wait-sgpr-hazards -amdgpu-sgpr-hazard-boundary-cull=1 -o - %s | FileCheck -check-prefixes=GCN,BC,NOMEMC %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec,amdgpu-wait-sgpr-hazards -amdgpu-sgpr-hazard-boundary-cull=0 -amdgpu-sgpr-hazard-mem-wait-cull=1 -amdgpu-sgpr-hazard-mem-wait-cull-threshold=1 -o - %s | FileCheck -check-prefixes=GCN,NOBC,MEMC %s --- | @mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> ] @@ -12,6 +13,7 @@ define amdgpu_gs void @hazard_vcc1() { ret void } define amdgpu_gs void @hazard_vcc2() { ret void } define amdgpu_gs void @hazard_vcc3() { ret void } + define amdgpu_gs void @hazard_merge_vcc() { ret void } define amdgpu_gs void @hazard_addc1() { ret void } define amdgpu_gs void @hazard_addc2() { ret void } define amdgpu_gs void @hazard_addc3() { ret void } @@ -28,26 +30,33 @@ define amdgpu_gs void @hazard_post_order2() { ret void } define amdgpu_gs void @hazard_post_order_cycle() { ret void } 
define amdgpu_cs void @hazard_calls() { ret void } + define void @hazard_callee1() { ret void } + define void @hazard_callee2() { ret void } + define amdgpu_cs void @hazard_carry_vcc() { ret void } + define amdgpu_cs void @hazard_carry_vcc_no_hazard() { ret void } + define amdgpu_cs void @hazard_carry_sgpr() { ret void } + define amdgpu_cs void @hazard_carry_sgpr_no_hazard1() { ret void } + define amdgpu_cs void @hazard_carry_sgpr_no_hazard2() { ret void } + define amdgpu_cs void @hazard_carry_sgpr_no_hazard3() { ret void } + define amdgpu_cs void @hazard_cull_vmem() { ret void } + define amdgpu_cs void @hazard_cull_sample() { ret void } + define amdgpu_cs void @hazard_cull_bvh() { ret void } + define amdgpu_cs void @hazard_nocull_scratch() { ret void } + define amdgpu_cs void @hazard_cull_global() { ret void } + define amdgpu_cs void @hazard_nocull_flat() { ret void } + define amdgpu_cs void @hazard_existing_cull() { ret void } ... --- name: hazard_getpc1 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_getpc1 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_getpc1 - ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_getpc1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec $sgpr0_sgpr1 = 
S_GETPC_B64 $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc @@ -58,20 +67,12 @@ body: | name: hazard_getpc2 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_getpc2 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec - ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_getpc2 - ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec - ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_getpc2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec $sgpr0_sgpr1 = S_GETPC_B64 $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc @@ -82,27 +83,15 @@ body: | name: hazard_getpc3 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_getpc3 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O0-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { - ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 20, implicit-def $scc, implicit $scc - ; GCN-O0-NEXT: } - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_getpc3 - ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; 
GCN-O2-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { - ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc - ; GCN-O2-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 16, implicit-def $scc, implicit $scc - ; GCN-O2-NEXT: } - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_getpc3 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc + ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 16, implicit-def $scc, implicit $scc + ; GCN-NEXT: } + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec BUNDLE implicit-def $sgpr0_sgpr1 { $sgpr0_sgpr1 = S_GETPC_B64 @@ -116,31 +105,17 @@ body: | name: hazard_getpc4 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_getpc4 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O0-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { - ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 28, implicit-def $scc, implicit $scc - ; GCN-O0-NEXT: } - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_getpc4 - ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O2-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { - ; GCN-O2-NEXT: $sgpr0_sgpr1 
= S_GETPC_B64 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1 - ; GCN-O2-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 24, implicit-def $scc, implicit $scc - ; GCN-O2-NEXT: } - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_getpc4 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 24, implicit-def $scc, implicit $scc + ; GCN-NEXT: } + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec BUNDLE implicit-def $sgpr0_sgpr1 { $sgpr0_sgpr1 = S_GETPC_B64 @@ -155,20 +130,12 @@ body: | name: hazard_vcc1 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_vcc1 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec - ; GCN-O0-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_vcc1 - ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec - ; GCN-O2-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_vcc1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec + ; GCN-NEXT: $sgpr3 = 
S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc @@ -179,20 +146,12 @@ body: | name: hazard_vcc2 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_vcc2 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec - ; GCN-O0-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_vcc2 - ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec - ; GCN-O2-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_vcc2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc @@ -203,21 +162,56 @@ body: | name: hazard_vcc3 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_vcc3 - ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec - ; GCN-O0-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_vcc3 - ; GCN-O2: 
$vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec - ; GCN-O2-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_vcc3 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_merge_vcc +body: | + ; GCN-LABEL: name: hazard_merge_vcc + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vcc_lo = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: S_WAITCNT_DEPCTR 65532 + ; GCN-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + bb.0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + S_CBRANCH_SCC0 %bb.1, implicit $scc + S_BRANCH %bb.2 + bb.1: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc + 
S_BRANCH %bb.3 + bb.2: + $vcc_lo = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec + S_BRANCH %bb.3 + bb.3: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc, implicit $exec S_ENDPGM 0 ... @@ -226,20 +220,12 @@ body: | name: hazard_addc1 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_addc1 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_addc1 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_addc1 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc @@ -250,20 +236,12 @@ body: | name: hazard_addc2 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_addc2 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_addc2 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, 
implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_addc2 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + ; GCN-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc @@ -274,20 +252,12 @@ body: | name: hazard_addc3 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_addc3 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_addc3 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_addc3 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc @@ -298,20 +268,12 @@ body: | name: hazard_addc4 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_addc4 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr3 
= S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_addc4 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_addc4 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec + ; GCN-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc @@ -322,26 +284,14 @@ body: | name: hazard_addc5 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_addc5 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr32 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_addc5 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr32 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_addc5 + ; GCN: 
$vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr32 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr16 = S_MOV_B32 0 $sgpr32 = S_MOV_B32 0 @@ -354,35 +304,17 @@ body: | name: hazard_addc6 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_addc6 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr32 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr48 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr80 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr96 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_addc6 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr32 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr48 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr80 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr96 = S_MOV_B32 0 - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_addc6 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr16 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr32 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr48 = S_MOV_B32 0 + ; GCN-NEXT: 
$sgpr80 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr96 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr16 = S_MOV_B32 0 $sgpr32 = S_MOV_B32 0 @@ -398,19 +330,12 @@ body: | name: hazard_vaddc1 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_vaddc1 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_vaddc1 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_vaddc1 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec @@ -421,36 +346,20 @@ body: | name: hazard_gap1 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_gap1 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: $sgpr1 
= S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_gap1 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_gap1 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec S_NOP 0 S_NOP 0 @@ -469,36 +378,20 @@ body: | name: hazard_gap2 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_gap2 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_gap2 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 
0, implicit $exec - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_gap2 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc S_NOP 0 @@ -517,41 +410,19 @@ body: | name: hazard_gap3 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_gap3 - ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc - ; 
GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_gap3 - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_gap3 + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + ; GCN-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + ; GCN-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + ; GCN-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + ; GCN-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + ; GCN-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc @@ -569,49 +440,22 @@ body: | name: hazard_gap4_no_hazard body: | bb.0: - ; GCN-O0-LABEL: name: hazard_gap4_no_hazard - ; GCN-O0: $vgpr1, $vcc_lo = 
V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_gap4_no_hazard - ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def 
$scc - ; GCN-O2-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_gap4_no_hazard + ; GCN: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc + ; GCN-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc + ; GCN-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc + ; GCN-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc + ; GCN-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc + ; GCN-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc + ; GCN-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc + ; GCN-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc + ; GCN-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc + ; GCN-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc @@ -632,19 +476,11 @@ body: | name: hazard_valu_write1_no_hazard body: | bb.0: - ; GCN-O0-LABEL: name: hazard_valu_write1_no_hazard - ; GCN-O0: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec - ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_valu_write1_no_hazard - ; GCN-O2: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, 
implicit $exec - ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc - ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_valu_write1_no_hazard + ; GCN: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec + ; GCN-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; GCN-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc @@ -655,19 +491,11 @@ body: | name: hazard_post_order1 body: | bb.0: - ; GCN-O0-LABEL: name: hazard_post_order1 - ; GCN-O0: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_post_order1 - ; GCN-O2: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_post_order1 + ; GCN: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 $sgpr0_sgpr1 = S_GETPC_B64 $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec @@ -677,31 +505,17 @@ body: | --- name: hazard_post_order2 body: | - ; GCN-O0-LABEL: name: hazard_post_order2 - ; GCN-O0: bb.0: - ; GCN-O0-NEXT: successors: %bb.1(0x80000000) - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; 
GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_BRANCH %bb.1 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.1: - ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_post_order2 - ; GCN-O2: bb.0: - ; GCN-O2-NEXT: successors: %bb.1(0x80000000) - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: S_BRANCH %bb.1 - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.1: - ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_post_order2 + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 bb.0: $sgpr0_sgpr1 = S_GETPC_B64 $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc @@ -715,50 +529,27 @@ body: | --- name: hazard_post_order_cycle body: | - ; GCN-O0-LABEL: name: hazard_post_order_cycle - ; GCN-O0: bb.0: - ; GCN-O0-NEXT: successors: %bb.1(0x80000000) - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: S_NOP 0 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.1: - ; GCN-O0-NEXT: successors: %bb.2(0x80000000) - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.2: - ; GCN-O0-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; 
GCN-O0-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.3: - ; GCN-O0-NEXT: S_ENDPGM 0 - ; - ; GCN-O2-LABEL: name: hazard_post_order_cycle - ; GCN-O2: bb.0: - ; GCN-O2-NEXT: successors: %bb.1(0x80000000) - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: S_NOP 0 - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.1: - ; GCN-O2-NEXT: successors: %bb.2(0x80000000) - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.2: - ; GCN-O2-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec - ; GCN-O2-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.3: - ; GCN-O2-NEXT: S_ENDPGM 0 + ; GCN-LABEL: name: hazard_post_order_cycle + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: S_ENDPGM 0 bb.0: S_NOP 0 @@ -779,84 +570,485 @@ name: hazard_calls frameInfo: hasCalls: true body: | - ; GCN-O0-LABEL: name: hazard_calls - ; GCN-O0: bb.0: - ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_SETPC_B64 $sgpr0_sgpr1 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.1: - ; GCN-O0-NEXT: $sgpr18 = S_MOV_B32 0 - ; GCN-O0-NEXT: 
S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.2: - ; GCN-O0-NEXT: successors: %bb.3(0x80000000) - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: $sgpr20 = S_MOV_B32 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.3: - ; GCN-O0-NEXT: successors: %bb.4(0x80000000) - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: {{ $}} - ; GCN-O0-NEXT: bb.4: - ; GCN-O0-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 - ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O0-NEXT: S_ENDPGM 0 + ; NOBC-LABEL: name: hazard_calls + ; NOBC: bb.0: + ; NOBC-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: $vgpr0 = V_WRITELANE_B32 0, $sgpr4, $vgpr0 + ; NOBC-NEXT: $vgpr0 = V_WRITELANE_B32 1, $sgpr8, $vgpr0 + ; NOBC-NEXT: $vgpr0 = V_WRITELANE_B32 2, $sgpr16, $vgpr0 + ; NOBC-NEXT: $vgpr0 = V_WRITELANE_B32 3, $sgpr18, $vgpr0 + ; NOBC-NEXT: $vgpr0 = V_WRITELANE_B32 4, $sgpr20, $vgpr0 + ; NOBC-NEXT: $vgpr0 = V_WRITELANE_B32 5, $sgpr22, $vgpr0 + ; NOBC-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; NOBC-NEXT: S_BRANCH %bb.1 + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: bb.1: + ; NOBC-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: S_CBRANCH_SCC0 %bb.3, implicit $scc + ; NOBC-NEXT: S_BRANCH %bb.4 + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: bb.2: + ; NOBC-NEXT: $sgpr16 = S_MOV_B32 0 + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: S_SETPC_B64 $sgpr0_sgpr1 + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: bb.3: + ; NOBC-NEXT: $sgpr18 = S_MOV_B32 0 + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: bb.4: + ; NOBC-NEXT: 
successors: %bb.5(0x80000000) + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: $vcc_lo = S_MOV_B32 0 + ; NOBC-NEXT: $sgpr20 = S_MOV_B32 0 + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: bb.5: + ; NOBC-NEXT: successors: %bb.6(0x80000000) + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 + ; NOBC-NEXT: {{ $}} + ; NOBC-NEXT: bb.6: + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 + ; NOBC-NEXT: S_ENDPGM 0 ; - ; GCN-O2-LABEL: name: hazard_calls - ; GCN-O2: bb.0: - ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0 - ; GCN-O2-NEXT: S_SETPC_B64 $sgpr0_sgpr1 - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.1: - ; GCN-O2-NEXT: $sgpr18 = S_MOV_B32 0 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.2: - ; GCN-O2-NEXT: successors: %bb.3(0x80000000) - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: $sgpr20 = S_MOV_B32 0 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.3: - ; GCN-O2-NEXT: successors: %bb.4(0x80000000) - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 - ; GCN-O2-NEXT: {{ $}} - ; GCN-O2-NEXT: bb.4: - ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534 - ; GCN-O2-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 - ; GCN-O2-NEXT: S_ENDPGM 0 + ; BC-LABEL: name: hazard_calls + ; BC: bb.0: + ; BC-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; BC-NEXT: {{ $}} + ; BC-NEXT: $vgpr0 = V_WRITELANE_B32 0, $sgpr4, $vgpr0 + ; BC-NEXT: $vgpr0 = V_WRITELANE_B32 1, $sgpr8, $vgpr0 + ; BC-NEXT: $vgpr0 = V_WRITELANE_B32 2, $sgpr16, $vgpr0 + ; 
BC-NEXT: $vgpr0 = V_WRITELANE_B32 3, $sgpr18, $vgpr0 + ; BC-NEXT: $vgpr0 = V_WRITELANE_B32 4, $sgpr20, $vgpr0 + ; BC-NEXT: $vgpr0 = V_WRITELANE_B32 5, $sgpr22, $vgpr0 + ; BC-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; BC-NEXT: S_BRANCH %bb.1 + ; BC-NEXT: {{ $}} + ; BC-NEXT: bb.1: + ; BC-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; BC-NEXT: {{ $}} + ; BC-NEXT: S_CBRANCH_SCC0 %bb.3, implicit $scc + ; BC-NEXT: S_BRANCH %bb.4 + ; BC-NEXT: {{ $}} + ; BC-NEXT: bb.2: + ; BC-NEXT: $sgpr16 = S_MOV_B32 0 + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: S_WAITCNT_DEPCTR 65534 + ; BC-NEXT: S_SETPC_B64 $sgpr0_sgpr1 + ; BC-NEXT: {{ $}} + ; BC-NEXT: bb.3: + ; BC-NEXT: $sgpr18 = S_MOV_B32 0 + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: S_WAITCNT_DEPCTR 65534 + ; BC-NEXT: S_SETPC_B64_return $sgpr0_sgpr1 + ; BC-NEXT: {{ $}} + ; BC-NEXT: bb.4: + ; BC-NEXT: successors: %bb.5(0x80000000) + ; BC-NEXT: {{ $}} + ; BC-NEXT: $vcc_lo = S_MOV_B32 0 + ; BC-NEXT: $sgpr20 = S_MOV_B32 0 + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: S_WAITCNT_DEPCTR 65534 + ; BC-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 + ; BC-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc + ; BC-NEXT: {{ $}} + ; BC-NEXT: bb.5: + ; BC-NEXT: successors: %bb.6(0x80000000) + ; BC-NEXT: {{ $}} + ; BC-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0 + ; BC-NEXT: {{ $}} + ; BC-NEXT: bb.6: + ; BC-NEXT: $sgpr22 = S_MOV_B32 $sgpr8 + ; BC-NEXT: S_ENDPGM 0 bb.0: + $vgpr0 = V_WRITELANE_B32 0, $sgpr4, $vgpr0 + $vgpr0 = V_WRITELANE_B32 1, $sgpr8, $vgpr0 + 
$vgpr0 = V_WRITELANE_B32 2, $sgpr16, $vgpr0 + $vgpr0 = V_WRITELANE_B32 3, $sgpr18, $vgpr0 + $vgpr0 = V_WRITELANE_B32 4, $sgpr20, $vgpr0 + $vgpr0 = V_WRITELANE_B32 5, $sgpr22, $vgpr0 + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + S_CBRANCH_SCC0 %bb.3, implicit $scc + S_BRANCH %bb.4 + + bb.2: $sgpr16 = S_MOV_B32 0 S_SETPC_B64 $sgpr0_sgpr1 - bb.1: + bb.3: $sgpr18 = S_MOV_B32 0 S_SETPC_B64_return $sgpr0_sgpr1 - bb.2: + bb.4: + $vcc_lo = S_MOV_B32 0 $sgpr20 = S_MOV_B32 0 $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3 $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc - bb.3: + bb.5: $sgpr8_sgpr9 = S_CALL_B64 0 - bb.4: + bb.6: $sgpr22 = S_MOV_B32 $sgpr8 S_ENDPGM 0 ... + +--- +name: hazard_callee1 +body: | + bb.0: + ; NOBC-LABEL: name: hazard_callee1 + ; NOBC: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + ; + ; BC-LABEL: name: hazard_callee1 + ; BC: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; BC-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; BC-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_SETPC_B64_return $sgpr30_sgpr31 +... 
+ +--- +name: hazard_callee2 +body: | + bb.0: + ; NOBC-LABEL: name: hazard_callee2 + ; NOBC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; NOBC-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; NOBC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOBC-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + ; + ; BC-LABEL: name: hazard_callee2 + ; BC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; BC-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + ; BC-NEXT: S_WAITCNT_DEPCTR 65534 + ; BC-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: DS_NOP implicit $m0, implicit $exec + ; BC-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc + $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc + S_SETPC_B64_return $sgpr30_sgpr31 +... + +--- +name: hazard_carry_vcc +body: | + bb.0: + ; GCN-LABEL: name: hazard_carry_vcc + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $vcc_lo, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, $vgpr1, implicit-def $vcc_lo, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 65533 + ; GCN-NEXT: $vgpr1 = V_ADDC_U32_e32 $vgpr2, $vgpr3, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $vcc_lo, 0, implicit $exec + $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, $vgpr1, implicit-def $vcc, implicit $exec + $vgpr1 = V_ADDC_U32_e32 $vgpr2, $vgpr3, implicit-def $vcc, implicit $vcc, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: hazard_carry_vcc_no_hazard +body: | + bb.0: + ; GCN-LABEL: name: hazard_carry_vcc_no_hazard + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $vcc_lo, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, $vgpr1, implicit-def $vcc_lo, implicit $exec + ; GCN-NEXT: $sgpr8 = S_MOV_B32 $vcc_lo + ; GCN-NEXT: $vgpr1 = V_ADDC_U32_e32 $vgpr2, $vgpr3, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $vcc_lo, 0, implicit $exec + $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, $vgpr1, implicit-def $vcc, implicit $exec + $sgpr8 = S_MOV_B32 $vcc_lo + $vgpr1 = V_ADDC_U32_e32 $vgpr2, $vgpr3, implicit-def $vcc, implicit $vcc, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_carry_sgpr +body: | + bb.0: + ; GCN-LABEL: name: hazard_carry_sgpr + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 61951 + ; GCN-NEXT: $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: hazard_carry_sgpr_no_hazard1 +body: | + bb.0: + ; GCN-LABEL: name: hazard_carry_sgpr_no_hazard1 + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + ; GCN-NEXT: $sgpr8 = S_MOV_B32 $sgpr0 + ; GCN-NEXT: $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + $sgpr8 = S_MOV_B32 $sgpr0 + $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_carry_sgpr_no_hazard2 +body: | + bb.0: + ; GCN-LABEL: name: hazard_carry_sgpr_no_hazard2 + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: hazard_carry_sgpr_no_hazard3 +body: | + bb.0: + ; GCN-LABEL: name: hazard_carry_sgpr_no_hazard3 + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + ; GCN-NEXT: $sgpr8 = S_LOAD_DWORD_IMM $sgpr6_sgpr7, 0, 0 + ; GCN-NEXT: $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0, $sgpr0 = V_ADD_CO_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + $sgpr8 = S_LOAD_DWORD_IMM $sgpr6_sgpr7, 0, 0 + $vgpr1, $sgpr1 = V_ADDC_U32_e64 $vgpr2, $vgpr3, $sgpr0, 0, implicit $exec + S_ENDPGM 0 +... + +--- +name: hazard_cull_vmem +body: | + bb.0: + ; NOMEMC-LABEL: name: hazard_cull_vmem + ; NOMEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; NOMEMC-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + ; NOMEMC-NEXT: S_WAIT_LOADCNT 0 + ; NOMEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; NOMEMC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOMEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; NOMEMC-NEXT: S_ENDPGM 0 + ; + ; MEMC-LABEL: name: hazard_cull_vmem + ; MEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; MEMC-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: S_WAIT_LOADCNT 0 + ; MEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; MEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; MEMC-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, 0, 0, 0, implicit $exec + S_WAIT_LOADCNT 0 + 
$sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_cull_sample +body: | + bb.0: + ; NOMEMC-LABEL: name: hazard_cull_sample + ; NOMEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; NOMEMC-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) + ; NOMEMC-NEXT: S_WAIT_SAMPLECNT 0 + ; NOMEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; NOMEMC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOMEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; NOMEMC-NEXT: S_ENDPGM 0 + ; + ; MEMC-LABEL: name: hazard_cull_sample + ; MEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; MEMC-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: S_WAIT_SAMPLECNT 0 + ; MEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; MEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; MEMC-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) + S_WAIT_SAMPLECNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_cull_bvh +body: | + bb.0: + ; NOMEMC-LABEL: name: hazard_cull_bvh + ; NOMEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; NOMEMC-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) + ; NOMEMC-NEXT: S_WAIT_BVHCNT 0 + ; NOMEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; NOMEMC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOMEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; NOMEMC-NEXT: S_ENDPGM 0 + ; + ; MEMC-LABEL: name: hazard_cull_bvh + ; MEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; MEMC-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: S_WAIT_BVHCNT 0 + ; MEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; MEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; MEMC-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) + S_WAIT_BVHCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: hazard_nocull_scratch +body: | + bb.0: + ; GCN-LABEL: name: hazard_nocull_scratch + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_cull_global +body: | + bb.0: + ; NOMEMC-LABEL: name: hazard_cull_global + ; NOMEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; NOMEMC-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; NOMEMC-NEXT: S_WAIT_LOADCNT 0 + ; NOMEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; NOMEMC-NEXT: S_WAITCNT_DEPCTR 65534 + ; NOMEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; NOMEMC-NEXT: S_ENDPGM 0 + ; + ; MEMC-LABEL: name: hazard_cull_global + ; MEMC: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; MEMC-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: DS_NOP implicit $m0, implicit $exec + ; MEMC-NEXT: S_WAIT_LOADCNT 0 + ; MEMC-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; MEMC-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; MEMC-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + 
S_ENDPGM 0 +... + +--- +name: hazard_nocull_flat +body: | + bb.0: + ; GCN-LABEL: name: hazard_nocull_flat + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + S_WAIT_LOADCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +--- +name: hazard_existing_cull +body: | + bb.0: + ; GCN-LABEL: name: hazard_existing_cull + ; GCN: $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: DS_NOP implicit $m0, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $sgpr0 = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec + DS_NOP implicit $m0, implicit $exec + DS_NOP implicit $m0, implicit $exec + DS_NOP implicit $m0, implicit $exec + DS_NOP implicit $m0, implicit $exec + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir index e3b96c08348fc..d49381c9b8aff 100644 --- a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir @@ -1,12 +1,11 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s # GCN-LABEL: name: hazard_vcmpx_permlane16 # GCN: V_CMPX_LE_F32_nosdst_e32 # GCN: S_ADD_U32 # GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec -# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_permlane16 @@ -52,6 +51,7 @@ body: | # GCN: V_CMPX_LE_F32_nosdst_e32 # GCN: V_NOP # GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_permlane16_v_nop @@ -129,7 +129,6 @@ body: | # GCN: V_CMPX_LE_F32_nosdst_e32 # GCN: S_ADD_U32 # GCN-NEXT: dead $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec -# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_permlane16_undef_src @@ -152,7 +151,6 @@ body: | # GCN: V_CMPX_LE_F32_nosdst_e64 # GCN: S_ADD_U32 # GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec -# GFX12-NEXT: S_WAITCNT_DEPCTR # GCN-NEXT: V_PERMLANE16_B32_e64 --- name: hazard_vcmpx_e64_permlane16 diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll index 0af8c95da8d8b..0b277f88e513e 100644 --- 
a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll @@ -15,14 +15,11 @@ target datalayout = "A5" ; GCN-ALLOCA: buffer_load_dword ; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1 -; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0 +; GCN-PROMOTE: s_cselect_b32 [[IND1:s[0-9]+]], 1, 0 ; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]] -; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 +; GCN-PROMOTE: s_cselect_b32 [[IND2:s[0-9]+]], [[IND1]], 2 ; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc -; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc +; GCN-PROMOTE: s_cselect_b32 [[IND3:s[0-9]+]], [[IND2]], 3 ; GCN-PROMOTE: ScratchSize: 0 define amdgpu_kernel void @vector_read_alloca_bitcast(ptr addrspace(1) %out, i32 %index) { @@ -51,7 +48,7 @@ entry: ; GCN-ALLOCA-COUNT-5: buffer_store_dword ; GCN-ALLOCA: buffer_load_dword -; GCN-PROMOTE-COUNT-7: v_cndmask +; GCN-PROMOTE-COUNT-7: s_cselect_b32 ; GCN-PROMOTE: ScratchSize: 0 @@ -292,14 +289,11 @@ entry: ; GCN-ALLOCA: buffer_load_dword ; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1 -; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0 +; GCN-PROMOTE: s_cselect_b32 [[IND1:s[0-9]+]], 1, 0 ; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]] -; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 +; GCN-PROMOTE: s_cselect_b32 [[IND2:s[0-9]+]], [[IND1]], 2 ; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc -; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc +; GCN-PROMOTE: s_cselect_b32 [[IND3:s[0-9]+]], [[IND2]], 3 ; GCN-PROMOTE: ScratchSize: 0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll 
b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll index 5ced02f28c977..2ee62d13fcc51 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll @@ -35,34 +35,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX900-NEXT: v_writelane_b32 v63, s37, 1 ; GFX900-NEXT: v_writelane_b32 v63, s38, 2 ; GFX900-NEXT: v_writelane_b32 v63, s39, 3 -; GFX900-NEXT: v_writelane_b32 v63, s40, 4 -; GFX900-NEXT: v_writelane_b32 v63, s41, 5 -; GFX900-NEXT: v_writelane_b32 v63, s42, 6 -; GFX900-NEXT: v_writelane_b32 v63, s43, 7 -; GFX900-NEXT: v_writelane_b32 v63, s44, 8 -; GFX900-NEXT: v_writelane_b32 v63, s45, 9 -; GFX900-NEXT: v_writelane_b32 v63, s46, 10 -; GFX900-NEXT: v_writelane_b32 v63, s47, 11 -; GFX900-NEXT: v_writelane_b32 v63, s48, 12 -; GFX900-NEXT: v_writelane_b32 v63, s49, 13 -; GFX900-NEXT: v_writelane_b32 v63, s50, 14 -; GFX900-NEXT: v_writelane_b32 v63, s51, 15 -; GFX900-NEXT: v_writelane_b32 v63, s52, 16 -; GFX900-NEXT: v_writelane_b32 v63, s53, 17 -; GFX900-NEXT: v_writelane_b32 v63, s54, 18 -; GFX900-NEXT: v_writelane_b32 v63, s55, 19 -; GFX900-NEXT: v_writelane_b32 v63, s56, 20 -; GFX900-NEXT: v_writelane_b32 v63, s57, 21 -; GFX900-NEXT: v_writelane_b32 v63, s58, 22 -; GFX900-NEXT: v_writelane_b32 v63, s59, 23 -; GFX900-NEXT: v_writelane_b32 v63, s60, 24 -; GFX900-NEXT: v_writelane_b32 v63, s61, 25 -; GFX900-NEXT: v_writelane_b32 v63, s62, 26 -; GFX900-NEXT: v_writelane_b32 v63, s63, 27 -; GFX900-NEXT: v_writelane_b32 v63, s64, 28 -; GFX900-NEXT: v_writelane_b32 v63, s65, 29 -; GFX900-NEXT: v_writelane_b32 v63, s66, 30 -; GFX900-NEXT: v_writelane_b32 v63, s67, 31 +; GFX900-NEXT: v_writelane_b32 v63, s48, 4 +; GFX900-NEXT: v_writelane_b32 v63, s49, 5 +; GFX900-NEXT: v_writelane_b32 v63, s50, 6 +; GFX900-NEXT: v_writelane_b32 v63, s51, 7 +; GFX900-NEXT: v_writelane_b32 v63, s52, 8 +; GFX900-NEXT: v_writelane_b32 v63, s53, 9 +; GFX900-NEXT: v_writelane_b32 v63, s54, 10 +; GFX900-NEXT: 
v_writelane_b32 v63, s55, 11 +; GFX900-NEXT: v_writelane_b32 v63, s64, 12 +; GFX900-NEXT: v_writelane_b32 v63, s65, 13 +; GFX900-NEXT: v_writelane_b32 v63, s66, 14 +; GFX900-NEXT: v_writelane_b32 v63, s67, 15 ; GFX900-NEXT: v_mov_b32_e32 v33, v30 ; GFX900-NEXT: v_mov_b32_e32 v34, v29 ; GFX900-NEXT: v_mov_b32_e32 v35, v28 @@ -160,34 +144,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX900-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX900-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: v_readlane_b32 s67, v63, 31 -; GFX900-NEXT: v_readlane_b32 s66, v63, 30 -; GFX900-NEXT: v_readlane_b32 s65, v63, 29 -; GFX900-NEXT: v_readlane_b32 s64, v63, 28 -; GFX900-NEXT: v_readlane_b32 s63, v63, 27 -; GFX900-NEXT: v_readlane_b32 s62, v63, 26 -; GFX900-NEXT: v_readlane_b32 s61, v63, 25 -; GFX900-NEXT: v_readlane_b32 s60, v63, 24 -; GFX900-NEXT: v_readlane_b32 s59, v63, 23 -; GFX900-NEXT: v_readlane_b32 s58, v63, 22 -; GFX900-NEXT: v_readlane_b32 s57, v63, 21 -; GFX900-NEXT: v_readlane_b32 s56, v63, 20 -; GFX900-NEXT: v_readlane_b32 s55, v63, 19 -; GFX900-NEXT: v_readlane_b32 s54, v63, 18 -; GFX900-NEXT: v_readlane_b32 s53, v63, 17 -; GFX900-NEXT: v_readlane_b32 s52, v63, 16 -; GFX900-NEXT: v_readlane_b32 s51, v63, 15 -; GFX900-NEXT: v_readlane_b32 s50, v63, 14 -; GFX900-NEXT: v_readlane_b32 s49, v63, 13 -; GFX900-NEXT: v_readlane_b32 s48, v63, 12 -; GFX900-NEXT: v_readlane_b32 s47, v63, 11 -; GFX900-NEXT: v_readlane_b32 s46, v63, 10 -; GFX900-NEXT: v_readlane_b32 s45, v63, 9 -; GFX900-NEXT: v_readlane_b32 s44, v63, 8 -; GFX900-NEXT: v_readlane_b32 s43, v63, 7 -; GFX900-NEXT: v_readlane_b32 s42, v63, 6 -; GFX900-NEXT: v_readlane_b32 s41, v63, 5 -; GFX900-NEXT: v_readlane_b32 s40, v63, 4 +; GFX900-NEXT: v_readlane_b32 
s67, v63, 15 +; GFX900-NEXT: v_readlane_b32 s66, v63, 14 +; GFX900-NEXT: v_readlane_b32 s65, v63, 13 +; GFX900-NEXT: v_readlane_b32 s64, v63, 12 +; GFX900-NEXT: v_readlane_b32 s55, v63, 11 +; GFX900-NEXT: v_readlane_b32 s54, v63, 10 +; GFX900-NEXT: v_readlane_b32 s53, v63, 9 +; GFX900-NEXT: v_readlane_b32 s52, v63, 8 +; GFX900-NEXT: v_readlane_b32 s51, v63, 7 +; GFX900-NEXT: v_readlane_b32 s50, v63, 6 +; GFX900-NEXT: v_readlane_b32 s49, v63, 5 +; GFX900-NEXT: v_readlane_b32 s48, v63, 4 ; GFX900-NEXT: v_readlane_b32 s39, v63, 3 ; GFX900-NEXT: v_readlane_b32 s38, v63, 2 ; GFX900-NEXT: v_readlane_b32 s37, v63, 1 @@ -238,34 +206,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX906-NEXT: v_writelane_b32 v63, s37, 1 ; GFX906-NEXT: v_writelane_b32 v63, s38, 2 ; GFX906-NEXT: v_writelane_b32 v63, s39, 3 -; GFX906-NEXT: v_writelane_b32 v63, s40, 4 -; GFX906-NEXT: v_writelane_b32 v63, s41, 5 -; GFX906-NEXT: v_writelane_b32 v63, s42, 6 -; GFX906-NEXT: v_writelane_b32 v63, s43, 7 -; GFX906-NEXT: v_writelane_b32 v63, s44, 8 -; GFX906-NEXT: v_writelane_b32 v63, s45, 9 -; GFX906-NEXT: v_writelane_b32 v63, s46, 10 -; GFX906-NEXT: v_writelane_b32 v63, s47, 11 -; GFX906-NEXT: v_writelane_b32 v63, s48, 12 -; GFX906-NEXT: v_writelane_b32 v63, s49, 13 -; GFX906-NEXT: v_writelane_b32 v63, s50, 14 -; GFX906-NEXT: v_writelane_b32 v63, s51, 15 -; GFX906-NEXT: v_writelane_b32 v63, s52, 16 -; GFX906-NEXT: v_writelane_b32 v63, s53, 17 -; GFX906-NEXT: v_writelane_b32 v63, s54, 18 -; GFX906-NEXT: v_writelane_b32 v63, s55, 19 -; GFX906-NEXT: v_writelane_b32 v63, s56, 20 -; GFX906-NEXT: v_writelane_b32 v63, s57, 21 -; GFX906-NEXT: v_writelane_b32 v63, s58, 22 -; GFX906-NEXT: v_writelane_b32 v63, s59, 23 -; GFX906-NEXT: v_writelane_b32 v63, s60, 24 -; GFX906-NEXT: v_writelane_b32 v63, s61, 25 -; GFX906-NEXT: v_writelane_b32 v63, s62, 26 -; GFX906-NEXT: v_writelane_b32 v63, s63, 27 -; GFX906-NEXT: v_writelane_b32 v63, s64, 28 -; GFX906-NEXT: v_writelane_b32 v63, s65, 29 -; GFX906-NEXT: 
v_writelane_b32 v63, s66, 30 -; GFX906-NEXT: v_writelane_b32 v63, s67, 31 +; GFX906-NEXT: v_writelane_b32 v63, s48, 4 +; GFX906-NEXT: v_writelane_b32 v63, s49, 5 +; GFX906-NEXT: v_writelane_b32 v63, s50, 6 +; GFX906-NEXT: v_writelane_b32 v63, s51, 7 +; GFX906-NEXT: v_writelane_b32 v63, s52, 8 +; GFX906-NEXT: v_writelane_b32 v63, s53, 9 +; GFX906-NEXT: v_writelane_b32 v63, s54, 10 +; GFX906-NEXT: v_writelane_b32 v63, s55, 11 +; GFX906-NEXT: v_writelane_b32 v63, s64, 12 +; GFX906-NEXT: v_writelane_b32 v63, s65, 13 +; GFX906-NEXT: v_writelane_b32 v63, s66, 14 +; GFX906-NEXT: v_writelane_b32 v63, s67, 15 ; GFX906-NEXT: v_mov_b32_e32 v33, v30 ; GFX906-NEXT: v_mov_b32_e32 v34, v29 ; GFX906-NEXT: v_mov_b32_e32 v35, v28 @@ -363,34 +315,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX906-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX906-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: v_readlane_b32 s67, v63, 31 -; GFX906-NEXT: v_readlane_b32 s66, v63, 30 -; GFX906-NEXT: v_readlane_b32 s65, v63, 29 -; GFX906-NEXT: v_readlane_b32 s64, v63, 28 -; GFX906-NEXT: v_readlane_b32 s63, v63, 27 -; GFX906-NEXT: v_readlane_b32 s62, v63, 26 -; GFX906-NEXT: v_readlane_b32 s61, v63, 25 -; GFX906-NEXT: v_readlane_b32 s60, v63, 24 -; GFX906-NEXT: v_readlane_b32 s59, v63, 23 -; GFX906-NEXT: v_readlane_b32 s58, v63, 22 -; GFX906-NEXT: v_readlane_b32 s57, v63, 21 -; GFX906-NEXT: v_readlane_b32 s56, v63, 20 -; GFX906-NEXT: v_readlane_b32 s55, v63, 19 -; GFX906-NEXT: v_readlane_b32 s54, v63, 18 -; GFX906-NEXT: v_readlane_b32 s53, v63, 17 -; GFX906-NEXT: v_readlane_b32 s52, v63, 16 -; GFX906-NEXT: v_readlane_b32 s51, v63, 15 -; GFX906-NEXT: v_readlane_b32 s50, v63, 14 -; GFX906-NEXT: v_readlane_b32 s49, v63, 13 -; GFX906-NEXT: 
v_readlane_b32 s48, v63, 12 -; GFX906-NEXT: v_readlane_b32 s47, v63, 11 -; GFX906-NEXT: v_readlane_b32 s46, v63, 10 -; GFX906-NEXT: v_readlane_b32 s45, v63, 9 -; GFX906-NEXT: v_readlane_b32 s44, v63, 8 -; GFX906-NEXT: v_readlane_b32 s43, v63, 7 -; GFX906-NEXT: v_readlane_b32 s42, v63, 6 -; GFX906-NEXT: v_readlane_b32 s41, v63, 5 -; GFX906-NEXT: v_readlane_b32 s40, v63, 4 +; GFX906-NEXT: v_readlane_b32 s67, v63, 15 +; GFX906-NEXT: v_readlane_b32 s66, v63, 14 +; GFX906-NEXT: v_readlane_b32 s65, v63, 13 +; GFX906-NEXT: v_readlane_b32 s64, v63, 12 +; GFX906-NEXT: v_readlane_b32 s55, v63, 11 +; GFX906-NEXT: v_readlane_b32 s54, v63, 10 +; GFX906-NEXT: v_readlane_b32 s53, v63, 9 +; GFX906-NEXT: v_readlane_b32 s52, v63, 8 +; GFX906-NEXT: v_readlane_b32 s51, v63, 7 +; GFX906-NEXT: v_readlane_b32 s50, v63, 6 +; GFX906-NEXT: v_readlane_b32 s49, v63, 5 +; GFX906-NEXT: v_readlane_b32 s48, v63, 4 ; GFX906-NEXT: v_readlane_b32 s39, v63, 3 ; GFX906-NEXT: v_readlane_b32 s38, v63, 2 ; GFX906-NEXT: v_readlane_b32 s37, v63, 1 @@ -440,34 +376,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX908-NEXT: v_writelane_b32 v62, s37, 1 ; GFX908-NEXT: v_writelane_b32 v62, s38, 2 ; GFX908-NEXT: v_writelane_b32 v62, s39, 3 -; GFX908-NEXT: v_writelane_b32 v62, s40, 4 -; GFX908-NEXT: v_writelane_b32 v62, s41, 5 -; GFX908-NEXT: v_writelane_b32 v62, s42, 6 -; GFX908-NEXT: v_writelane_b32 v62, s43, 7 -; GFX908-NEXT: v_writelane_b32 v62, s44, 8 -; GFX908-NEXT: v_writelane_b32 v62, s45, 9 -; GFX908-NEXT: v_writelane_b32 v62, s46, 10 -; GFX908-NEXT: v_writelane_b32 v62, s47, 11 -; GFX908-NEXT: v_writelane_b32 v62, s48, 12 -; GFX908-NEXT: v_writelane_b32 v62, s49, 13 -; GFX908-NEXT: v_writelane_b32 v62, s50, 14 -; GFX908-NEXT: v_writelane_b32 v62, s51, 15 -; GFX908-NEXT: v_writelane_b32 v62, s52, 16 -; GFX908-NEXT: v_writelane_b32 v62, s53, 17 -; GFX908-NEXT: v_writelane_b32 v62, s54, 18 -; GFX908-NEXT: v_writelane_b32 v62, s55, 19 -; GFX908-NEXT: v_writelane_b32 v62, s56, 20 -; GFX908-NEXT: 
v_writelane_b32 v62, s57, 21 -; GFX908-NEXT: v_writelane_b32 v62, s58, 22 -; GFX908-NEXT: v_writelane_b32 v62, s59, 23 -; GFX908-NEXT: v_writelane_b32 v62, s60, 24 -; GFX908-NEXT: v_writelane_b32 v62, s61, 25 -; GFX908-NEXT: v_writelane_b32 v62, s62, 26 -; GFX908-NEXT: v_writelane_b32 v62, s63, 27 -; GFX908-NEXT: v_writelane_b32 v62, s64, 28 -; GFX908-NEXT: v_writelane_b32 v62, s65, 29 -; GFX908-NEXT: v_writelane_b32 v62, s66, 30 -; GFX908-NEXT: v_writelane_b32 v62, s67, 31 +; GFX908-NEXT: v_writelane_b32 v62, s48, 4 +; GFX908-NEXT: v_writelane_b32 v62, s49, 5 +; GFX908-NEXT: v_writelane_b32 v62, s50, 6 +; GFX908-NEXT: v_writelane_b32 v62, s51, 7 +; GFX908-NEXT: v_writelane_b32 v62, s52, 8 +; GFX908-NEXT: v_writelane_b32 v62, s53, 9 +; GFX908-NEXT: v_writelane_b32 v62, s54, 10 +; GFX908-NEXT: v_writelane_b32 v62, s55, 11 +; GFX908-NEXT: v_writelane_b32 v62, s64, 12 +; GFX908-NEXT: v_writelane_b32 v62, s65, 13 +; GFX908-NEXT: v_writelane_b32 v62, s66, 14 +; GFX908-NEXT: v_writelane_b32 v62, s67, 15 ; GFX908-NEXT: v_mov_b32_e32 v33, v30 ; GFX908-NEXT: v_mov_b32_e32 v34, v29 ; GFX908-NEXT: v_mov_b32_e32 v35, v28 @@ -569,34 +489,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX908-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX908-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 -; GFX908-NEXT: v_readlane_b32 s67, v62, 31 -; GFX908-NEXT: v_readlane_b32 s66, v62, 30 -; GFX908-NEXT: v_readlane_b32 s65, v62, 29 -; GFX908-NEXT: v_readlane_b32 s64, v62, 28 -; GFX908-NEXT: v_readlane_b32 s63, v62, 27 -; GFX908-NEXT: v_readlane_b32 s62, v62, 26 -; GFX908-NEXT: v_readlane_b32 s61, v62, 25 -; GFX908-NEXT: v_readlane_b32 s60, v62, 24 -; GFX908-NEXT: v_readlane_b32 s59, v62, 23 -; GFX908-NEXT: v_readlane_b32 s58, v62, 22 -; GFX908-NEXT: 
v_readlane_b32 s57, v62, 21 -; GFX908-NEXT: v_readlane_b32 s56, v62, 20 -; GFX908-NEXT: v_readlane_b32 s55, v62, 19 -; GFX908-NEXT: v_readlane_b32 s54, v62, 18 -; GFX908-NEXT: v_readlane_b32 s53, v62, 17 -; GFX908-NEXT: v_readlane_b32 s52, v62, 16 -; GFX908-NEXT: v_readlane_b32 s51, v62, 15 -; GFX908-NEXT: v_readlane_b32 s50, v62, 14 -; GFX908-NEXT: v_readlane_b32 s49, v62, 13 -; GFX908-NEXT: v_readlane_b32 s48, v62, 12 -; GFX908-NEXT: v_readlane_b32 s47, v62, 11 -; GFX908-NEXT: v_readlane_b32 s46, v62, 10 -; GFX908-NEXT: v_readlane_b32 s45, v62, 9 -; GFX908-NEXT: v_readlane_b32 s44, v62, 8 -; GFX908-NEXT: v_readlane_b32 s43, v62, 7 -; GFX908-NEXT: v_readlane_b32 s42, v62, 6 -; GFX908-NEXT: v_readlane_b32 s41, v62, 5 -; GFX908-NEXT: v_readlane_b32 s40, v62, 4 +; GFX908-NEXT: v_readlane_b32 s67, v62, 15 +; GFX908-NEXT: v_readlane_b32 s66, v62, 14 +; GFX908-NEXT: v_readlane_b32 s65, v62, 13 +; GFX908-NEXT: v_readlane_b32 s64, v62, 12 +; GFX908-NEXT: v_readlane_b32 s55, v62, 11 +; GFX908-NEXT: v_readlane_b32 s54, v62, 10 +; GFX908-NEXT: v_readlane_b32 s53, v62, 9 +; GFX908-NEXT: v_readlane_b32 s52, v62, 8 +; GFX908-NEXT: v_readlane_b32 s51, v62, 7 +; GFX908-NEXT: v_readlane_b32 s50, v62, 6 +; GFX908-NEXT: v_readlane_b32 s49, v62, 5 +; GFX908-NEXT: v_readlane_b32 s48, v62, 4 ; GFX908-NEXT: v_readlane_b32 s39, v62, 3 ; GFX908-NEXT: v_readlane_b32 s38, v62, 2 ; GFX908-NEXT: v_readlane_b32 s37, v62, 1 @@ -646,34 +550,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX90a-NEXT: v_writelane_b32 v63, s37, 1 ; GFX90a-NEXT: v_writelane_b32 v63, s38, 2 ; GFX90a-NEXT: v_writelane_b32 v63, s39, 3 -; GFX90a-NEXT: v_writelane_b32 v63, s40, 4 -; GFX90a-NEXT: v_writelane_b32 v63, s41, 5 -; GFX90a-NEXT: v_writelane_b32 v63, s42, 6 -; GFX90a-NEXT: v_writelane_b32 v63, s43, 7 -; GFX90a-NEXT: v_writelane_b32 v63, s44, 8 -; GFX90a-NEXT: v_writelane_b32 v63, s45, 9 -; GFX90a-NEXT: v_writelane_b32 v63, s46, 10 -; GFX90a-NEXT: v_writelane_b32 v63, s47, 11 -; GFX90a-NEXT: v_writelane_b32 v63, 
s48, 12 -; GFX90a-NEXT: v_writelane_b32 v63, s49, 13 -; GFX90a-NEXT: v_writelane_b32 v63, s50, 14 -; GFX90a-NEXT: v_writelane_b32 v63, s51, 15 -; GFX90a-NEXT: v_writelane_b32 v63, s52, 16 -; GFX90a-NEXT: v_writelane_b32 v63, s53, 17 -; GFX90a-NEXT: v_writelane_b32 v63, s54, 18 -; GFX90a-NEXT: v_writelane_b32 v63, s55, 19 -; GFX90a-NEXT: v_writelane_b32 v63, s56, 20 -; GFX90a-NEXT: v_writelane_b32 v63, s57, 21 -; GFX90a-NEXT: v_writelane_b32 v63, s58, 22 -; GFX90a-NEXT: v_writelane_b32 v63, s59, 23 -; GFX90a-NEXT: v_writelane_b32 v63, s60, 24 -; GFX90a-NEXT: v_writelane_b32 v63, s61, 25 -; GFX90a-NEXT: v_writelane_b32 v63, s62, 26 -; GFX90a-NEXT: v_writelane_b32 v63, s63, 27 -; GFX90a-NEXT: v_writelane_b32 v63, s64, 28 -; GFX90a-NEXT: v_writelane_b32 v63, s65, 29 -; GFX90a-NEXT: v_writelane_b32 v63, s66, 30 -; GFX90a-NEXT: v_writelane_b32 v63, s67, 31 +; GFX90a-NEXT: v_writelane_b32 v63, s48, 4 +; GFX90a-NEXT: v_writelane_b32 v63, s49, 5 +; GFX90a-NEXT: v_writelane_b32 v63, s50, 6 +; GFX90a-NEXT: v_writelane_b32 v63, s51, 7 +; GFX90a-NEXT: v_writelane_b32 v63, s52, 8 +; GFX90a-NEXT: v_writelane_b32 v63, s53, 9 +; GFX90a-NEXT: v_writelane_b32 v63, s54, 10 +; GFX90a-NEXT: v_writelane_b32 v63, s55, 11 +; GFX90a-NEXT: v_writelane_b32 v63, s64, 12 +; GFX90a-NEXT: v_writelane_b32 v63, s65, 13 +; GFX90a-NEXT: v_writelane_b32 v63, s66, 14 +; GFX90a-NEXT: v_writelane_b32 v63, s67, 15 ; GFX90a-NEXT: v_mov_b32_e32 v33, v30 ; GFX90a-NEXT: v_mov_b32_e32 v34, v29 ; GFX90a-NEXT: v_mov_b32_e32 v35, v28 @@ -771,34 +659,18 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX90a-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX90a-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_readlane_b32 s67, v63, 31 -; GFX90a-NEXT: 
v_readlane_b32 s66, v63, 30 -; GFX90a-NEXT: v_readlane_b32 s65, v63, 29 -; GFX90a-NEXT: v_readlane_b32 s64, v63, 28 -; GFX90a-NEXT: v_readlane_b32 s63, v63, 27 -; GFX90a-NEXT: v_readlane_b32 s62, v63, 26 -; GFX90a-NEXT: v_readlane_b32 s61, v63, 25 -; GFX90a-NEXT: v_readlane_b32 s60, v63, 24 -; GFX90a-NEXT: v_readlane_b32 s59, v63, 23 -; GFX90a-NEXT: v_readlane_b32 s58, v63, 22 -; GFX90a-NEXT: v_readlane_b32 s57, v63, 21 -; GFX90a-NEXT: v_readlane_b32 s56, v63, 20 -; GFX90a-NEXT: v_readlane_b32 s55, v63, 19 -; GFX90a-NEXT: v_readlane_b32 s54, v63, 18 -; GFX90a-NEXT: v_readlane_b32 s53, v63, 17 -; GFX90a-NEXT: v_readlane_b32 s52, v63, 16 -; GFX90a-NEXT: v_readlane_b32 s51, v63, 15 -; GFX90a-NEXT: v_readlane_b32 s50, v63, 14 -; GFX90a-NEXT: v_readlane_b32 s49, v63, 13 -; GFX90a-NEXT: v_readlane_b32 s48, v63, 12 -; GFX90a-NEXT: v_readlane_b32 s47, v63, 11 -; GFX90a-NEXT: v_readlane_b32 s46, v63, 10 -; GFX90a-NEXT: v_readlane_b32 s45, v63, 9 -; GFX90a-NEXT: v_readlane_b32 s44, v63, 8 -; GFX90a-NEXT: v_readlane_b32 s43, v63, 7 -; GFX90a-NEXT: v_readlane_b32 s42, v63, 6 -; GFX90a-NEXT: v_readlane_b32 s41, v63, 5 -; GFX90a-NEXT: v_readlane_b32 s40, v63, 4 +; GFX90a-NEXT: v_readlane_b32 s67, v63, 15 +; GFX90a-NEXT: v_readlane_b32 s66, v63, 14 +; GFX90a-NEXT: v_readlane_b32 s65, v63, 13 +; GFX90a-NEXT: v_readlane_b32 s64, v63, 12 +; GFX90a-NEXT: v_readlane_b32 s55, v63, 11 +; GFX90a-NEXT: v_readlane_b32 s54, v63, 10 +; GFX90a-NEXT: v_readlane_b32 s53, v63, 9 +; GFX90a-NEXT: v_readlane_b32 s52, v63, 8 +; GFX90a-NEXT: v_readlane_b32 s51, v63, 7 +; GFX90a-NEXT: v_readlane_b32 s50, v63, 6 +; GFX90a-NEXT: v_readlane_b32 s49, v63, 5 +; GFX90a-NEXT: v_readlane_b32 s48, v63, 4 ; GFX90a-NEXT: v_readlane_b32 s39, v63, 3 ; GFX90a-NEXT: v_readlane_b32 s38, v63, 2 ; GFX90a-NEXT: v_readlane_b32 s37, v63, 1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 3a49c9b23f59e..86685c961299b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -73,8 +73,7 @@ define amdgpu_ps float @else2(i32 %z, float %v) #0 { ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %16:vgpr_32, %bb.0, %5, %bb.3 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %16:vgpr_32, %bb.0, [[COPY]], %bb.3 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %15:vgpr_32, %bb.0, %4, %bb.3 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -91,10 +90,10 @@ define amdgpu_ps float @else2(i32 %z, float %v) #0 { ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.4.end: - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, [[V_ADD_F32_e64_]], %bb.2 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[V_ADD_F32_e64_]], %bb.2 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.1, [[V_ADD_F32_e64_]], %bb.2 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[V_ADD_F32_e64_]], %bb.2 ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[PHI2]], 0, killed [[PHI3]], 0, 0, implicit $mode, implicit $exec + ; SI-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[PHI1]], 0, killed [[PHI2]], 0, 0, implicit $mode, implicit $exec ; SI-NEXT: $vgpr0 = COPY killed [[V_ADD_F32_e64_1]] ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 main_body: @@ -252,8 +251,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: {{ $}} ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.4, [[PHI1]], %bb.2 - ; 
SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], killed [[PHI4]], implicit $exec ; SI-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec @@ -289,8 +288,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: {{ $}} ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %59:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %61:vgpr_32, %bb.8, [[COPY4]], %bb.6 - ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], killed [[PHI6]], implicit $exec ; SI-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed 
[[V_CMP_EQ_U64_e64_1]], implicit-def $exec, implicit-def dead $scc, implicit $exec @@ -373,8 +372,8 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %54:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 - ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], killed [[PHI3]], implicit $exec ; SI-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec @@ -409,8 +408,8 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 - ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed 
[[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], killed [[PHI4]], implicit $exec ; SI-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_1]], implicit-def $exec, implicit-def dead $scc, implicit $exec @@ -548,7 +547,7 @@ sw.bb: ; preds = %if.then9 br label %sw.bb18 sw.bb18: ; preds = %sw.bb, %if.then9 - %a.sroa.0.0 = phi <4 x i8> [ %a.sroa.0.0.vecblend, %sw.bb ], [ undef, %if.then9 ] + %a.sroa.0.0 = phi <4 x i8> [ %a.sroa.0.0.vecblend, %sw.bb ], [ poison, %if.then9 ] %a.sroa.0.0.vec.extract61 = shufflevector <4 x i8> %a.sroa.0.0, <4 x i8> zeroinitializer, <3 x i32> %i19 = insertelement <3 x i8> %a.sroa.0.0.vec.extract61, i8 0, i64 0 %i20 = select <3 x i1> zeroinitializer, <3 x i8> zeroinitializer, <3 x i8> %i19 @@ -590,22 +589,22 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: bb.2: ; SI-NEXT: successors: %bb.3(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; SI-NEXT: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec ; SI-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec ; SI-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_]], killed [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; SI-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub4, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub5, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub4, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub5, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub4_sub5, implicit $exec ; SI-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_]], killed [[V_CMP_EQ_U64_e64_2]], implicit-def dead $scc - ; SI-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub6, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub7, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE]].sub6, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub7, implicit $exec ; SI-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]].sub6_sub7, implicit $exec ; SI-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_1]], killed [[V_CMP_EQ_U64_e64_3]], implicit-def dead $scc @@ -620,12 +619,12 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: bb.4: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub0, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub1, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_8]], %subreg.sub0, [[V_READFIRSTLANE_B32_9]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE6]], [[GLOBAL_LOAD_DWORDX4_2]].sub0_sub1, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub2, implicit $exec - ; SI-NEXT: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub3, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub2, implicit $exec + ; SI-NEXT: 
[[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub3, implicit $exec ; SI-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_10]], %subreg.sub0, [[V_READFIRSTLANE_B32_11]], %subreg.sub1 ; SI-NEXT: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE7]], [[GLOBAL_LOAD_DWORDX4_2]].sub2_sub3, implicit $exec ; SI-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_4]], killed [[V_CMP_EQ_U64_e64_5]], implicit-def dead $scc diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir index 7fcb58bd9434b..9e16b6566192a 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir @@ -492,6 +492,38 @@ body: | ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; MUBUF-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec @@ -546,6 +578,38 @@ body: | ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr46 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX9-FLATSCR-NEXT: $vgpr1 = 
V_MOV_B32_e32 $sgpr32, implicit $exec @@ -601,6 +665,38 @@ body: | ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX10-FLATSCR-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr88 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX10-FLATSCR-NEXT: $vgpr1 = V_ADD_U32_e64 $sgpr32, 8200, 0, implicit $exec @@ -655,6 +751,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; VMEM-GFX8-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec @@ -737,6 +865,38 @@ body: | ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr45 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; MUBUF-NEXT: $vgpr2 = V_MOV_B32_e32 8200, implicit $exec @@ -793,6 +953,38 @@ body: | ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; 
GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; 
GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX9-FLATSCR-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr32, implicit $exec @@ -849,6 +1041,38 @@ body: | ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; 
GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX10-FLATSCR-NEXT: $vgpr2 = V_ADD_U32_e64 $sgpr32, 8200, 0, implicit $exec @@ -904,6 +1128,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; VMEM-GFX8-NEXT: $vgpr2 = 
V_MOV_B32_e32 8200, implicit $exec @@ -988,6 +1244,38 @@ body: | ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; MUBUF-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; MUBUF-NEXT: $vgpr3 = V_MOV_B32_e32 8200, implicit $exec @@ -1046,6 +1334,38 @@ body: | ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX9-FLATSCR-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX9-FLATSCR-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr32, implicit $exec @@ -1103,6 +1423,38 @@ body: | ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX10-FLATSCR-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr47 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX10-FLATSCR-NEXT: $vgpr3 = V_ADD_U32_e64 $sgpr32, 8200, 0, 
implicit $exec @@ -1159,6 +1511,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; VMEM-GFX8-NEXT: $vgpr3 = V_MOV_B32_e32 8200, implicit $exec @@ -1241,6 +1625,38 @@ body: | ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; MUBUF-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr74 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec @@ -1294,6 +1710,38 @@ body: | ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX9-FLATSCR-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr57 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX9-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec @@ -1348,6 +1796,38 @@ body: | ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr28 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX10-FLATSCR-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX10-FLATSCR-NEXT: $vgpr1 = V_ADD_U32_e64 $sgpr32, 8200, 0, implicit $exec @@ -1401,6 +1881,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; VMEM-GFX8-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec @@ -1481,6 +1993,38 @@ body: | ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr56 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; MUBUF-NEXT: $vgpr2 = V_MOV_B32_e32 8200, implicit $exec @@ -1535,6 +2079,38 @@ body: | ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr40 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr93 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX9-FLATSCR-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr32, implicit $exec @@ -1589,6 +2165,38 @@ body: | ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr74 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX10-FLATSCR-NEXT: $vgpr2 = V_ADD_U32_e64 $sgpr32, 8200, 0, implicit $exec @@ -1642,6 +2250,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; VMEM-GFX8-NEXT: $vgpr2 = V_MOV_B32_e32 8200, implicit $exec @@ -1723,6 +2363,38 @@ body: | ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; MUBUF-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; MUBUF-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr95 ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; MUBUF-NEXT: $vgpr3 = V_MOV_B32_e32 8200, implicit $exec @@ -1778,6 +2450,38 @@ body: | ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; 
GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX9-FLATSCR-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr32, implicit $exec @@ -1832,6 +2536,38 @@ body: | ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX10-FLATSCR-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX10-FLATSCR-NEXT: $vgpr3 = V_ADD_U32_e64 $sgpr32, 8200, 0, implicit $exec @@ -1885,6 +2621,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION 
undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; VMEM-GFX8-NEXT: $vgpr3 = V_MOV_B32_e32 8200, implicit $exec @@ -1967,6 +2735,38 @@ body: | ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr77 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) @@ -2017,6 +2817,38 @@ body: | ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX9-FLATSCR-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr60 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: $sgpr4 = S_ADDC_U32 $sgpr32, 8200, implicit-def $scc, implicit $scc ; GFX9-FLATSCR-NEXT: S_BITCMP1_B32 $sgpr4, 0, implicit-def $scc @@ -2070,6 +2902,38 @@ body: | ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr42 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX10-FLATSCR-NEXT: 
frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: $sgpr4 = S_ADDC_U32 $sgpr32, 8200, implicit-def $scc, implicit $scc ; GFX10-FLATSCR-NEXT: S_BITCMP1_B32 $sgpr4, 0, implicit-def $scc @@ -2123,6 +2987,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec ; VMEM-GFX8-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) @@ -2203,6 +3099,38 @@ body: | ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; MUBUF-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr62 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; MUBUF-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec @@ -2257,6 +3185,38 @@ body: | ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined 
$sgpr45 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX9-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into 
%stack.2, addrspace 5) ; GFX9-FLATSCR-NEXT: $sgpr4 = S_ADDC_U32 $sgpr32, 8200, implicit-def $scc, implicit $scc @@ -2314,6 +3274,38 @@ body: | ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; GFX10-FLATSCR-NEXT: frame-setup 
CFI_INSTRUCTION undefined $sgpr79 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; GFX10-FLATSCR-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) ; GFX10-FLATSCR-NEXT: $sgpr4 = S_ADDC_U32 $sgpr32, 8200, implicit-def $scc, implicit $scc @@ -2371,6 +3363,38 @@ body: | ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr27 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr28 ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr29 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr40 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr41 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr42 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr43 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr44 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr45 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr46 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr47 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr56 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr57 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr58 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr59 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr60 + ; 
VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr61 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr62 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr63 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr72 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr73 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr74 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr75 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr76 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr77 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr78 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr79 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr88 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr89 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr90 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr91 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr92 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr93 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr94 + ; VMEM-GFX8-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr95 ; VMEM-GFX8-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; VMEM-GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; VMEM-GFX8-NEXT: $vgpr1 = V_MOV_B32_e32 8200, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index b5e4bcd049c42..8da7430d0a135 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s +; RUN: llc 
-mtriple=amdgcn -mcpu=gfx906 -amdgpu-coerce-illegal-types=1 < %s | FileCheck --check-prefix=FEATURE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=DEFAULT %s define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v3i8_liveout: @@ -32,6 +33,67 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[6:7] offset:2 ; GFX906-NEXT: global_store_short v1, v4, s[6:7] ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: v3i8_liveout: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; FEATURE-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; FEATURE-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; FEATURE-NEXT: v_mov_b32_e32 v3, 8 +; FEATURE-NEXT: s_mov_b32 s4, 0xff0000 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dword v4, v2, s[0:1] +; FEATURE-NEXT: v_mov_b32_e32 v1, 0 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; FEATURE-NEXT: v_and_or_b32 v4, v4, s4, v5 +; FEATURE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; FEATURE-NEXT: s_cbranch_execz .LBB0_2 +; FEATURE-NEXT: ; %bb.1: ; %bb.1 +; FEATURE-NEXT: global_load_dword v0, v2, s[2:3] +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; FEATURE-NEXT: v_and_or_b32 v4, v0, s4, v2 +; FEATURE-NEXT: .LBB0_2: ; %bb.2 +; FEATURE-NEXT: s_or_b64 exec, exec, s[0:1] +; 
FEATURE-NEXT: global_store_byte_d16_hi v1, v4, s[6:7] offset:2 +; FEATURE-NEXT: global_store_short v1, v4, s[6:7] +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: v3i8_liveout: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; DEFAULT-NEXT: v_mov_b32_e32 v3, 8 +; DEFAULT-NEXT: s_mov_b32 s4, 0xff0000 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dword v4, v2, s[0:1] +; DEFAULT-NEXT: v_mov_b32_e32 v1, 0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; DEFAULT-NEXT: v_and_or_b32 v4, v4, s4, v5 +; DEFAULT-NEXT: s_and_saveexec_b64 s[0:1], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB0_2 +; DEFAULT-NEXT: ; %bb.1: ; %bb.1 +; DEFAULT-NEXT: global_load_dword v0, v2, s[2:3] +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; DEFAULT-NEXT: v_and_or_b32 v4, v0, s4, v2 +; DEFAULT-NEXT: .LBB0_2: ; %bb.2 +; DEFAULT-NEXT: s_or_b64 exec, exec, s[0:1] +; DEFAULT-NEXT: global_store_byte_d16_hi v1, v4, s[6:7] offset:2 +; DEFAULT-NEXT: global_store_short v1, v4, s[6:7] +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -68,6 +130,43 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dword v1, v2, s[6:7] 
; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: v4i8_liveout: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; FEATURE-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; FEATURE-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; FEATURE-NEXT: v_mov_b32_e32 v1, 0 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dword v2, v3, s[0:1] +; FEATURE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; FEATURE-NEXT: s_cbranch_execz .LBB1_2 +; FEATURE-NEXT: ; %bb.1: ; %bb.1 +; FEATURE-NEXT: global_load_dword v2, v3, s[2:3] +; FEATURE-NEXT: .LBB1_2: ; %bb.2 +; FEATURE-NEXT: s_or_b64 exec, exec, s[0:1] +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: global_store_dword v1, v2, s[6:7] +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: v4i8_liveout: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; DEFAULT-NEXT: v_mov_b32_e32 v1, 0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dword v2, v3, s[0:1] +; DEFAULT-NEXT: s_and_saveexec_b64 s[0:1], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB1_2 +; DEFAULT-NEXT: ; %bb.1: ; %bb.1 +; DEFAULT-NEXT: global_load_dword v2, v3, s[2:3] +; DEFAULT-NEXT: .LBB1_2: ; %bb.2 +; DEFAULT-NEXT: s_or_b64 exec, exec, s[0:1] +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: global_store_dword v1, v2, s[6:7] +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -108,6 +207,51 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: global_store_byte v3, v2, s[6:7] offset:4 ; GFX906-NEXT: global_store_dword v3, v1, s[6:7] ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: v5i8_liveout: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; 
FEATURE-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; FEATURE-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; FEATURE-NEXT: v_mov_b32_e32 v3, 0 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: v_and_b32_e32 v2, 0xff, v2 +; FEATURE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; FEATURE-NEXT: s_cbranch_execz .LBB2_2 +; FEATURE-NEXT: ; %bb.1: ; %bb.1 +; FEATURE-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3] +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: v_and_b32_e32 v2, 0xff, v2 +; FEATURE-NEXT: .LBB2_2: ; %bb.2 +; FEATURE-NEXT: s_or_b64 exec, exec, s[0:1] +; FEATURE-NEXT: global_store_byte v3, v2, s[6:7] offset:4 +; FEATURE-NEXT: global_store_dword v3, v1, s[6:7] +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: v5i8_liveout: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; DEFAULT-NEXT: v_mov_b32_e32 v3, 0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: v_and_b32_e32 v2, 0xff, v2 +; DEFAULT-NEXT: s_and_saveexec_b64 s[0:1], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB2_2 +; DEFAULT-NEXT: ; %bb.1: ; %bb.1 +; DEFAULT-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3] +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: v_and_b32_e32 v2, 0xff, v2 +; DEFAULT-NEXT: .LBB2_2: ; %bb.2 +; DEFAULT-NEXT: s_or_b64 exec, exec, s[0:1] +; DEFAULT-NEXT: global_store_byte v3, v2, s[6:7] offset:4 +; DEFAULT-NEXT: global_store_dword v3, v1, s[6:7] +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -144,6 +288,43 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; 
GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[6:7] ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: v8i8_liveout: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; FEATURE-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; FEATURE-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; FEATURE-NEXT: v_mov_b32_e32 v3, 0 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] +; FEATURE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; FEATURE-NEXT: s_cbranch_execz .LBB3_2 +; FEATURE-NEXT: ; %bb.1: ; %bb.1 +; FEATURE-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3] +; FEATURE-NEXT: .LBB3_2: ; %bb.2 +; FEATURE-NEXT: s_or_b64 exec, exec, s[0:1] +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: global_store_dwordx2 v3, v[1:2], s[6:7] +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: v8i8_liveout: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; DEFAULT-NEXT: v_mov_b32_e32 v3, 0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] +; DEFAULT-NEXT: s_and_saveexec_b64 s[0:1], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB3_2 +; DEFAULT-NEXT: ; %bb.1: ; %bb.1 +; DEFAULT-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3] +; DEFAULT-NEXT: .LBB3_2: ; %bb.2 +; DEFAULT-NEXT: s_or_b64 exec, exec, s[0:1] +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: global_store_dwordx2 v3, v[1:2], s[6:7] +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -180,6 +361,43 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[6:7] ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: 
v16i8_liveout: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; FEATURE-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; FEATURE-NEXT: v_lshlrev_b32_e32 v6, 4, v0 +; FEATURE-NEXT: v_mov_b32_e32 v5, 0 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dwordx4 v[1:4], v6, s[0:1] +; FEATURE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; FEATURE-NEXT: s_cbranch_execz .LBB4_2 +; FEATURE-NEXT: ; %bb.1: ; %bb.1 +; FEATURE-NEXT: global_load_dwordx4 v[1:4], v6, s[2:3] +; FEATURE-NEXT: .LBB4_2: ; %bb.2 +; FEATURE-NEXT: s_or_b64 exec, exec, s[0:1] +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: global_store_dwordx4 v5, v[1:4], s[6:7] +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: v16i8_liveout: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v6, 4, v0 +; DEFAULT-NEXT: v_mov_b32_e32 v5, 0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dwordx4 v[1:4], v6, s[0:1] +; DEFAULT-NEXT: s_and_saveexec_b64 s[0:1], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB4_2 +; DEFAULT-NEXT: ; %bb.1: ; %bb.1 +; DEFAULT-NEXT: global_load_dwordx4 v[1:4], v6, s[2:3] +; DEFAULT-NEXT: .LBB4_2: ; %bb.2 +; DEFAULT-NEXT: s_or_b64 exec, exec, s[0:1] +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: global_store_dwordx4 v5, v[1:4], s[6:7] +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -220,6 +438,51 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[6:7] ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: v32i8_liveout: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; FEATURE-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x34 +; FEATURE-NEXT: v_lshlrev_b32_e32 v10, 5, v0 +; FEATURE-NEXT: v_mov_b32_e32 v9, 0 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dwordx4 v[1:4], v10, s[0:1] offset:16 +; FEATURE-NEXT: global_load_dwordx4 v[5:8], v10, s[0:1] +; FEATURE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; FEATURE-NEXT: s_cbranch_execz .LBB5_2 +; FEATURE-NEXT: ; %bb.1: ; %bb.1 +; FEATURE-NEXT: global_load_dwordx4 v[1:4], v10, s[2:3] offset:16 +; FEATURE-NEXT: global_load_dwordx4 v[5:8], v10, s[2:3] +; FEATURE-NEXT: .LBB5_2: ; %bb.2 +; FEATURE-NEXT: s_or_b64 exec, exec, s[0:1] +; FEATURE-NEXT: s_waitcnt vmcnt(1) +; FEATURE-NEXT: global_store_dwordx4 v9, v[1:4], s[6:7] offset:16 +; FEATURE-NEXT: s_waitcnt vmcnt(1) +; FEATURE-NEXT: global_store_dwordx4 v9, v[5:8], s[6:7] +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: v32i8_liveout: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v10, 5, v0 +; DEFAULT-NEXT: v_mov_b32_e32 v9, 0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dwordx4 v[1:4], v10, s[0:1] offset:16 +; DEFAULT-NEXT: global_load_dwordx4 v[5:8], v10, s[0:1] +; DEFAULT-NEXT: s_and_saveexec_b64 s[0:1], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB5_2 +; DEFAULT-NEXT: ; %bb.1: ; %bb.1 +; DEFAULT-NEXT: global_load_dwordx4 v[1:4], v10, s[2:3] offset:16 +; DEFAULT-NEXT: global_load_dwordx4 v[5:8], v10, s[2:3] +; DEFAULT-NEXT: .LBB5_2: ; %bb.2 +; DEFAULT-NEXT: s_or_b64 exec, exec, s[0:1] +; DEFAULT-NEXT: s_waitcnt vmcnt(1) +; DEFAULT-NEXT: global_store_dwordx4 v9, v[1:4], s[6:7] offset:16 +; DEFAULT-NEXT: s_waitcnt vmcnt(1) +; DEFAULT-NEXT: global_store_dwordx4 v9, v[5:8], s[6:7] +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 
%idx @@ -333,6 +596,197 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[6:7] offset:144 ; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[6:7] offset:128 ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: v256i8_liveout: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; FEATURE-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; FEATURE-NEXT: v_lshlrev_b32_e32 v61, 3, v0 +; FEATURE-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; FEATURE-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:240 +; FEATURE-NEXT: s_mov_b32 s14, -1 +; FEATURE-NEXT: s_mov_b32 s15, 0xe00000 +; FEATURE-NEXT: s_add_u32 s12, s12, s11 +; FEATURE-NEXT: s_addc_u32 s13, s13, 0 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; FEATURE-NEXT: v_mov_b32_e32 v4, 0 +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill +; FEATURE-NEXT: s_nop 0 +; FEATURE-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; FEATURE-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; FEATURE-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; FEATURE-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:224 +; FEATURE-NEXT: s_nop 0 +; FEATURE-NEXT: global_load_dwordx4 v[9:12], v61, s[0:1] offset:208 +; FEATURE-NEXT: global_load_dwordx4 v[13:16], v61, s[0:1] offset:192 +; FEATURE-NEXT: global_load_dwordx4 v[17:20], v61, s[0:1] offset:176 +; FEATURE-NEXT: global_load_dwordx4 v[21:24], v61, s[0:1] offset:160 +; FEATURE-NEXT: global_load_dwordx4 v[25:28], v61, s[0:1] offset:144 +; FEATURE-NEXT: global_load_dwordx4 v[29:32], v61, s[0:1] offset:128 +; FEATURE-NEXT: global_load_dwordx4 v[33:36], v61, s[0:1] offset:112 +; FEATURE-NEXT: global_load_dwordx4 v[37:40], v61, s[0:1] offset:96 +; 
FEATURE-NEXT: global_load_dwordx4 v[41:44], v61, s[0:1] offset:80 +; FEATURE-NEXT: global_load_dwordx4 v[45:48], v61, s[0:1] offset:64 +; FEATURE-NEXT: global_load_dwordx4 v[49:52], v61, s[0:1] offset:48 +; FEATURE-NEXT: global_load_dwordx4 v[53:56], v61, s[0:1] offset:32 +; FEATURE-NEXT: global_load_dwordx4 v[57:60], v61, s[0:1] offset:16 +; FEATURE-NEXT: global_load_dwordx4 v[0:3], v61, s[0:1] +; FEATURE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; FEATURE-NEXT: s_cbranch_execz .LBB6_2 +; FEATURE-NEXT: ; %bb.1: ; %bb.1 +; FEATURE-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] offset:240 +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; FEATURE-NEXT: s_nop 0 +; FEATURE-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; FEATURE-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; FEATURE-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; FEATURE-NEXT: global_load_dwordx4 v[5:8], v61, s[2:3] offset:224 +; FEATURE-NEXT: global_load_dwordx4 v[9:12], v61, s[2:3] offset:208 +; FEATURE-NEXT: global_load_dwordx4 v[13:16], v61, s[2:3] offset:192 +; FEATURE-NEXT: global_load_dwordx4 v[17:20], v61, s[2:3] offset:176 +; FEATURE-NEXT: global_load_dwordx4 v[21:24], v61, s[2:3] offset:160 +; FEATURE-NEXT: global_load_dwordx4 v[25:28], v61, s[2:3] offset:144 +; FEATURE-NEXT: global_load_dwordx4 v[29:32], v61, s[2:3] offset:128 +; FEATURE-NEXT: global_load_dwordx4 v[33:36], v61, s[2:3] offset:112 +; FEATURE-NEXT: global_load_dwordx4 v[37:40], v61, s[2:3] offset:96 +; FEATURE-NEXT: global_load_dwordx4 v[41:44], v61, s[2:3] offset:80 +; FEATURE-NEXT: global_load_dwordx4 v[45:48], v61, s[2:3] offset:64 +; FEATURE-NEXT: global_load_dwordx4 v[49:52], v61, s[2:3] offset:48 +; FEATURE-NEXT: global_load_dwordx4 v[53:56], v61, s[2:3] offset:32 +; FEATURE-NEXT: global_load_dwordx4 v[57:60], v61, s[2:3] offset:16 +; FEATURE-NEXT: 
global_load_dwordx4 v[0:3], v61, s[2:3] +; FEATURE-NEXT: .LBB6_2: ; %bb.2 +; FEATURE-NEXT: s_or_b64 exec, exec, s[0:1] +; FEATURE-NEXT: s_waitcnt vmcnt(7) +; FEATURE-NEXT: global_store_dwordx4 v4, v[33:36], s[6:7] offset:112 +; FEATURE-NEXT: s_waitcnt vmcnt(7) +; FEATURE-NEXT: global_store_dwordx4 v4, v[37:40], s[6:7] offset:96 +; FEATURE-NEXT: s_waitcnt vmcnt(7) +; FEATURE-NEXT: global_store_dwordx4 v4, v[41:44], s[6:7] offset:80 +; FEATURE-NEXT: s_waitcnt vmcnt(7) +; FEATURE-NEXT: global_store_dwordx4 v4, v[45:48], s[6:7] offset:64 +; FEATURE-NEXT: s_waitcnt vmcnt(7) +; FEATURE-NEXT: global_store_dwordx4 v4, v[49:52], s[6:7] offset:48 +; FEATURE-NEXT: s_waitcnt vmcnt(7) +; FEATURE-NEXT: global_store_dwordx4 v4, v[53:56], s[6:7] offset:32 +; FEATURE-NEXT: s_waitcnt vmcnt(7) +; FEATURE-NEXT: global_store_dwordx4 v4, v[57:60], s[6:7] offset:16 +; FEATURE-NEXT: s_waitcnt vmcnt(7) +; FEATURE-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; FEATURE-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; FEATURE-NEXT: s_nop 0 +; FEATURE-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; FEATURE-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; FEATURE-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:240 +; FEATURE-NEXT: global_store_dwordx4 v4, v[5:8], s[6:7] offset:224 +; FEATURE-NEXT: global_store_dwordx4 v4, v[9:12], s[6:7] offset:208 +; FEATURE-NEXT: global_store_dwordx4 v4, v[13:16], s[6:7] offset:192 +; FEATURE-NEXT: global_store_dwordx4 v4, v[17:20], s[6:7] offset:176 +; FEATURE-NEXT: global_store_dwordx4 v4, v[21:24], s[6:7] offset:160 +; FEATURE-NEXT: global_store_dwordx4 v4, v[25:28], s[6:7] offset:144 +; FEATURE-NEXT: global_store_dwordx4 v4, v[29:32], s[6:7] offset:128 +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: v256i8_liveout: +; DEFAULT: ; %bb.0: ; 
%entry +; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v61, 3, v0 +; DEFAULT-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; DEFAULT-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:240 +; DEFAULT-NEXT: s_mov_b32 s14, -1 +; DEFAULT-NEXT: s_mov_b32 s15, 0xe00000 +; DEFAULT-NEXT: s_add_u32 s12, s12, s11 +; DEFAULT-NEXT: s_addc_u32 s13, s13, 0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; DEFAULT-NEXT: v_mov_b32_e32 v4, 0 +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill +; DEFAULT-NEXT: s_nop 0 +; DEFAULT-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; DEFAULT-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; DEFAULT-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; DEFAULT-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:224 +; DEFAULT-NEXT: s_nop 0 +; DEFAULT-NEXT: global_load_dwordx4 v[9:12], v61, s[0:1] offset:208 +; DEFAULT-NEXT: global_load_dwordx4 v[13:16], v61, s[0:1] offset:192 +; DEFAULT-NEXT: global_load_dwordx4 v[17:20], v61, s[0:1] offset:176 +; DEFAULT-NEXT: global_load_dwordx4 v[21:24], v61, s[0:1] offset:160 +; DEFAULT-NEXT: global_load_dwordx4 v[25:28], v61, s[0:1] offset:144 +; DEFAULT-NEXT: global_load_dwordx4 v[29:32], v61, s[0:1] offset:128 +; DEFAULT-NEXT: global_load_dwordx4 v[33:36], v61, s[0:1] offset:112 +; DEFAULT-NEXT: global_load_dwordx4 v[37:40], v61, s[0:1] offset:96 +; DEFAULT-NEXT: global_load_dwordx4 v[41:44], v61, s[0:1] offset:80 +; DEFAULT-NEXT: global_load_dwordx4 v[45:48], v61, s[0:1] offset:64 +; DEFAULT-NEXT: global_load_dwordx4 v[49:52], v61, s[0:1] offset:48 +; DEFAULT-NEXT: global_load_dwordx4 v[53:56], v61, s[0:1] offset:32 +; DEFAULT-NEXT: global_load_dwordx4 v[57:60], v61, s[0:1] 
offset:16 +; DEFAULT-NEXT: global_load_dwordx4 v[0:3], v61, s[0:1] +; DEFAULT-NEXT: s_and_saveexec_b64 s[0:1], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB6_2 +; DEFAULT-NEXT: ; %bb.1: ; %bb.1 +; DEFAULT-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] offset:240 +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; DEFAULT-NEXT: s_nop 0 +; DEFAULT-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; DEFAULT-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; DEFAULT-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; DEFAULT-NEXT: global_load_dwordx4 v[5:8], v61, s[2:3] offset:224 +; DEFAULT-NEXT: global_load_dwordx4 v[9:12], v61, s[2:3] offset:208 +; DEFAULT-NEXT: global_load_dwordx4 v[13:16], v61, s[2:3] offset:192 +; DEFAULT-NEXT: global_load_dwordx4 v[17:20], v61, s[2:3] offset:176 +; DEFAULT-NEXT: global_load_dwordx4 v[21:24], v61, s[2:3] offset:160 +; DEFAULT-NEXT: global_load_dwordx4 v[25:28], v61, s[2:3] offset:144 +; DEFAULT-NEXT: global_load_dwordx4 v[29:32], v61, s[2:3] offset:128 +; DEFAULT-NEXT: global_load_dwordx4 v[33:36], v61, s[2:3] offset:112 +; DEFAULT-NEXT: global_load_dwordx4 v[37:40], v61, s[2:3] offset:96 +; DEFAULT-NEXT: global_load_dwordx4 v[41:44], v61, s[2:3] offset:80 +; DEFAULT-NEXT: global_load_dwordx4 v[45:48], v61, s[2:3] offset:64 +; DEFAULT-NEXT: global_load_dwordx4 v[49:52], v61, s[2:3] offset:48 +; DEFAULT-NEXT: global_load_dwordx4 v[53:56], v61, s[2:3] offset:32 +; DEFAULT-NEXT: global_load_dwordx4 v[57:60], v61, s[2:3] offset:16 +; DEFAULT-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] +; DEFAULT-NEXT: .LBB6_2: ; %bb.2 +; DEFAULT-NEXT: s_or_b64 exec, exec, s[0:1] +; DEFAULT-NEXT: s_waitcnt vmcnt(7) +; DEFAULT-NEXT: global_store_dwordx4 v4, v[33:36], s[6:7] offset:112 +; DEFAULT-NEXT: s_waitcnt vmcnt(7) +; DEFAULT-NEXT: global_store_dwordx4 v4, v[37:40], s[6:7] offset:96 +; DEFAULT-NEXT: 
s_waitcnt vmcnt(7) +; DEFAULT-NEXT: global_store_dwordx4 v4, v[41:44], s[6:7] offset:80 +; DEFAULT-NEXT: s_waitcnt vmcnt(7) +; DEFAULT-NEXT: global_store_dwordx4 v4, v[45:48], s[6:7] offset:64 +; DEFAULT-NEXT: s_waitcnt vmcnt(7) +; DEFAULT-NEXT: global_store_dwordx4 v4, v[49:52], s[6:7] offset:48 +; DEFAULT-NEXT: s_waitcnt vmcnt(7) +; DEFAULT-NEXT: global_store_dwordx4 v4, v[53:56], s[6:7] offset:32 +; DEFAULT-NEXT: s_waitcnt vmcnt(7) +; DEFAULT-NEXT: global_store_dwordx4 v4, v[57:60], s[6:7] offset:16 +; DEFAULT-NEXT: s_waitcnt vmcnt(7) +; DEFAULT-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; DEFAULT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; DEFAULT-NEXT: s_nop 0 +; DEFAULT-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; DEFAULT-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; DEFAULT-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:240 +; DEFAULT-NEXT: global_store_dwordx4 v4, v[5:8], s[6:7] offset:224 +; DEFAULT-NEXT: global_store_dwordx4 v4, v[9:12], s[6:7] offset:208 +; DEFAULT-NEXT: global_store_dwordx4 v4, v[13:16], s[6:7] offset:192 +; DEFAULT-NEXT: global_store_dwordx4 v4, v[17:20], s[6:7] offset:176 +; DEFAULT-NEXT: global_store_dwordx4 v4, v[21:24], s[6:7] offset:160 +; DEFAULT-NEXT: global_store_dwordx4 v4, v[25:28], s[6:7] offset:144 +; DEFAULT-NEXT: global_store_dwordx4 v4, v[29:32], s[6:7] offset:128 +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -378,6 +832,61 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr ; GFX906-NEXT: global_store_dword v1, v0, s[6:7] ; GFX906-NEXT: .LBB7_6: ; %return ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: repeat_successor: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: 
s_load_dword s8, s[4:5], 0x24 +; FEATURE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; FEATURE-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: s_cmp_lt_i32 s8, 3 +; FEATURE-NEXT: s_cbranch_scc0 .LBB7_3 +; FEATURE-NEXT: ; %bb.1: ; %LeafBlock +; FEATURE-NEXT: s_cmp_gt_i32 s8, 0 +; FEATURE-NEXT: s_cbranch_scc0 .LBB7_6 +; FEATURE-NEXT: ; %bb.2: +; FEATURE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; FEATURE-NEXT: global_load_dword v0, v0, s[0:1] +; FEATURE-NEXT: s_branch .LBB7_5 +; FEATURE-NEXT: .LBB7_3: ; %LeafBlock5 +; FEATURE-NEXT: s_cmp_eq_u32 s8, 3 +; FEATURE-NEXT: s_cbranch_scc0 .LBB7_6 +; FEATURE-NEXT: ; %bb.4: ; %sw.bb5 +; FEATURE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; FEATURE-NEXT: global_load_dword v0, v0, s[2:3] +; FEATURE-NEXT: .LBB7_5: ; %return.sink.split +; FEATURE-NEXT: v_mov_b32_e32 v1, 0 +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: global_store_dword v1, v0, s[6:7] +; FEATURE-NEXT: .LBB7_6: ; %return +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: repeat_successor: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dword s8, s[4:5], 0x24 +; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: s_cmp_lt_i32 s8, 3 +; DEFAULT-NEXT: s_cbranch_scc0 .LBB7_3 +; DEFAULT-NEXT: ; %bb.1: ; %LeafBlock +; DEFAULT-NEXT: s_cmp_gt_i32 s8, 0 +; DEFAULT-NEXT: s_cbranch_scc0 .LBB7_6 +; DEFAULT-NEXT: ; %bb.2: +; DEFAULT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; DEFAULT-NEXT: global_load_dword v0, v0, s[0:1] +; DEFAULT-NEXT: s_branch .LBB7_5 +; DEFAULT-NEXT: .LBB7_3: ; %LeafBlock5 +; DEFAULT-NEXT: s_cmp_eq_u32 s8, 3 +; DEFAULT-NEXT: s_cbranch_scc0 .LBB7_6 +; DEFAULT-NEXT: ; %bb.4: ; %sw.bb5 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; DEFAULT-NEXT: global_load_dword v0, v0, s[2:3] +; DEFAULT-NEXT: .LBB7_5: ; %return.sink.split +; DEFAULT-NEXT: v_mov_b32_e32 v1, 0 +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: 
global_store_dword v1, v0, s[6:7] +; DEFAULT-NEXT: .LBB7_6: ; %return +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -433,6 +942,67 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: v8i8_phi_chain: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; FEATURE-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; FEATURE-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dwordx2 v[1:2], v3, s[8:9] +; FEATURE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; FEATURE-NEXT: s_cbranch_execz .LBB8_2 +; FEATURE-NEXT: ; %bb.1: ; %bb.1 +; FEATURE-NEXT: global_load_dwordx2 v[1:2], v3, s[10:11] +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; FEATURE-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; FEATURE-NEXT: s_and_b64 s[4:5], vcc, exec +; FEATURE-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; FEATURE-NEXT: .LBB8_2: ; %Flow +; FEATURE-NEXT: s_or_b64 exec, exec, s[2:3] +; FEATURE-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; FEATURE-NEXT: s_cbranch_execz .LBB8_4 +; FEATURE-NEXT: ; %bb.3: ; %bb.2 +; FEATURE-NEXT: v_mov_b32_e32 v0, 0 +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13] +; FEATURE-NEXT: .LBB8_4: ; %bb.3 +; FEATURE-NEXT: s_or_b64 exec, exec, s[2:3] +; FEATURE-NEXT: v_mov_b32_e32 v0, 0 +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: v8i8_phi_chain: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; DEFAULT-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; 
DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dwordx2 v[1:2], v3, s[8:9] +; DEFAULT-NEXT: s_and_saveexec_b64 s[2:3], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB8_2 +; DEFAULT-NEXT: ; %bb.1: ; %bb.1 +; DEFAULT-NEXT: global_load_dwordx2 v[1:2], v3, s[10:11] +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; DEFAULT-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; DEFAULT-NEXT: s_and_b64 s[4:5], vcc, exec +; DEFAULT-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; DEFAULT-NEXT: .LBB8_2: ; %Flow +; DEFAULT-NEXT: s_or_b64 exec, exec, s[2:3] +; DEFAULT-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; DEFAULT-NEXT: s_cbranch_execz .LBB8_4 +; DEFAULT-NEXT: ; %bb.3: ; %bb.2 +; DEFAULT-NEXT: v_mov_b32_e32 v0, 0 +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13] +; DEFAULT-NEXT: .LBB8_4: ; %bb.3 +; DEFAULT-NEXT: s_or_b64 exec, exec, s[2:3] +; DEFAULT-NEXT: v_mov_b32_e32 v0, 0 +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -496,6 +1066,83 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: v8i8_phi_zeroinit: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; FEATURE-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; FEATURE-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; FEATURE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9] +; FEATURE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; FEATURE-NEXT: s_cbranch_execz .LBB9_2 +; FEATURE-NEXT: ; %bb.1: ; %bb.1 +; FEATURE-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11] +; FEATURE-NEXT: s_mov_b32 s4, 0 +; 
FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; FEATURE-NEXT: s_mov_b32 s5, s4 +; FEATURE-NEXT: s_waitcnt vmcnt(1) +; FEATURE-NEXT: v_mov_b32_e32 v3, s4 +; FEATURE-NEXT: v_mov_b32_e32 v4, s5 +; FEATURE-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; FEATURE-NEXT: s_and_b64 s[4:5], vcc, exec +; FEATURE-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; FEATURE-NEXT: .LBB9_2: ; %Flow +; FEATURE-NEXT: s_or_b64 exec, exec, s[2:3] +; FEATURE-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; FEATURE-NEXT: s_cbranch_execz .LBB9_4 +; FEATURE-NEXT: ; %bb.3: ; %bb.2 +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: v_mov_b32_e32 v1, v3 +; FEATURE-NEXT: v_mov_b32_e32 v0, 0 +; FEATURE-NEXT: v_mov_b32_e32 v2, v4 +; FEATURE-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] +; FEATURE-NEXT: .LBB9_4: ; %bb.3 +; FEATURE-NEXT: s_or_b64 exec, exec, s[2:3] +; FEATURE-NEXT: v_mov_b32_e32 v0, 0 +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: v8i8_phi_zeroinit: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; DEFAULT-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; DEFAULT-NEXT: ; implicit-def: $vgpr1_vgpr2 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9] +; DEFAULT-NEXT: s_and_saveexec_b64 s[2:3], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB9_2 +; DEFAULT-NEXT: ; %bb.1: ; %bb.1 +; DEFAULT-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11] +; DEFAULT-NEXT: s_mov_b32 s4, 0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; DEFAULT-NEXT: s_mov_b32 s5, s4 +; DEFAULT-NEXT: s_waitcnt vmcnt(1) +; DEFAULT-NEXT: v_mov_b32_e32 v3, s4 +; DEFAULT-NEXT: v_mov_b32_e32 v4, s5 +; DEFAULT-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; DEFAULT-NEXT: s_and_b64 s[4:5], vcc, exec +; DEFAULT-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; DEFAULT-NEXT: .LBB9_2: ; %Flow +; DEFAULT-NEXT: s_or_b64 
exec, exec, s[2:3] +; DEFAULT-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; DEFAULT-NEXT: s_cbranch_execz .LBB9_4 +; DEFAULT-NEXT: ; %bb.3: ; %bb.2 +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: v_mov_b32_e32 v1, v3 +; DEFAULT-NEXT: v_mov_b32_e32 v0, 0 +; DEFAULT-NEXT: v_mov_b32_e32 v2, v4 +; DEFAULT-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] +; DEFAULT-NEXT: .LBB9_4: ; %bb.3 +; DEFAULT-NEXT: s_or_b64 exec, exec, s[2:3] +; DEFAULT-NEXT: v_mov_b32_e32 v0, 0 +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -605,6 +1252,177 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15] ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: v8i8_phi_const: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; FEATURE-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; FEATURE-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; FEATURE-NEXT: ; implicit-def: $vgpr3 +; FEATURE-NEXT: ; implicit-def: $vgpr13 +; FEATURE-NEXT: ; implicit-def: $vgpr11 +; FEATURE-NEXT: ; implicit-def: $vgpr14 +; FEATURE-NEXT: ; implicit-def: $vgpr15 +; FEATURE-NEXT: ; implicit-def: $vgpr12 +; FEATURE-NEXT: ; implicit-def: $vgpr16 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dwordx2 v[1:2], v4, s[8:9] +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; FEATURE-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; FEATURE-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; FEATURE-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; FEATURE-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; FEATURE-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; FEATURE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; 
FEATURE-NEXT: s_cbranch_execz .LBB10_2 +; FEATURE-NEXT: ; %bb.1: ; %bb.1 +; FEATURE-NEXT: global_load_dwordx2 v[3:4], v4, s[10:11] +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; FEATURE-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; FEATURE-NEXT: s_and_b64 s[4:5], vcc, exec +; FEATURE-NEXT: v_mov_b32_e32 v1, 1 +; FEATURE-NEXT: v_mov_b32_e32 v10, 2 +; FEATURE-NEXT: v_mov_b32_e32 v9, 3 +; FEATURE-NEXT: v_mov_b32_e32 v8, 4 +; FEATURE-NEXT: v_mov_b32_e32 v2, 5 +; FEATURE-NEXT: v_mov_b32_e32 v7, 6 +; FEATURE-NEXT: v_mov_b32_e32 v6, 7 +; FEATURE-NEXT: v_mov_b32_e32 v5, 8 +; FEATURE-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: v_lshrrev_b32_e32 v16, 24, v4 +; FEATURE-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; FEATURE-NEXT: v_lshrrev_b32_e32 v15, 8, v4 +; FEATURE-NEXT: v_lshrrev_b32_e32 v14, 24, v3 +; FEATURE-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; FEATURE-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; FEATURE-NEXT: .LBB10_2: ; %Flow +; FEATURE-NEXT: s_or_b64 exec, exec, s[2:3] +; FEATURE-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; FEATURE-NEXT: s_cbranch_execz .LBB10_4 +; FEATURE-NEXT: ; %bb.3: ; %bb.2 +; FEATURE-NEXT: v_lshlrev_b16_e32 v3, 8, v10 +; FEATURE-NEXT: v_lshlrev_b16_e32 v4, 8, v8 +; FEATURE-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; FEATURE-NEXT: v_lshlrev_b16_e32 v4, 8, v7 +; FEATURE-NEXT: v_lshlrev_b16_e32 v11, 8, v5 +; FEATURE-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v11, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_mov_b32_e32 v0, 0 +; FEATURE-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; FEATURE-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] +; FEATURE-NEXT: v_mov_b32_e32 v3, v1 +; FEATURE-NEXT: v_mov_b32_e32 v13, v10 +; FEATURE-NEXT: v_mov_b32_e32 v11, v9 +; FEATURE-NEXT: v_mov_b32_e32 v14, v8 +; FEATURE-NEXT: v_mov_b32_e32 v4, v2 +; FEATURE-NEXT: v_mov_b32_e32 v15, v7 +; FEATURE-NEXT: v_mov_b32_e32 v12, v6 +; FEATURE-NEXT: v_mov_b32_e32 v16, v5 +; FEATURE-NEXT: .LBB10_4: ; %bb.3 +; FEATURE-NEXT: s_or_b64 exec, exec, s[2:3] +; FEATURE-NEXT: v_lshlrev_b16_e32 v0, 8, v13 +; FEATURE-NEXT: v_lshlrev_b16_e32 v1, 8, v14 +; FEATURE-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; FEATURE-NEXT: v_lshlrev_b16_e32 v1, 8, v15 +; FEATURE-NEXT: v_lshlrev_b16_e32 v3, 8, v16 +; FEATURE-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_mov_b32_e32 v2, 0 +; FEATURE-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; FEATURE-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15] +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: v8i8_phi_const: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; DEFAULT-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; DEFAULT-NEXT: ; implicit-def: $vgpr3 +; DEFAULT-NEXT: ; implicit-def: $vgpr13 +; DEFAULT-NEXT: ; implicit-def: $vgpr11 +; DEFAULT-NEXT: ; implicit-def: $vgpr14 +; DEFAULT-NEXT: ; implicit-def: $vgpr15 +; DEFAULT-NEXT: ; implicit-def: $vgpr12 +; DEFAULT-NEXT: ; implicit-def: $vgpr16 +; 
DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dwordx2 v[1:2], v4, s[8:9] +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; DEFAULT-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; DEFAULT-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; DEFAULT-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; DEFAULT-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; DEFAULT-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; DEFAULT-NEXT: s_and_saveexec_b64 s[2:3], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB10_2 +; DEFAULT-NEXT: ; %bb.1: ; %bb.1 +; DEFAULT-NEXT: global_load_dwordx2 v[3:4], v4, s[10:11] +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; DEFAULT-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; DEFAULT-NEXT: s_and_b64 s[4:5], vcc, exec +; DEFAULT-NEXT: v_mov_b32_e32 v1, 1 +; DEFAULT-NEXT: v_mov_b32_e32 v10, 2 +; DEFAULT-NEXT: v_mov_b32_e32 v9, 3 +; DEFAULT-NEXT: v_mov_b32_e32 v8, 4 +; DEFAULT-NEXT: v_mov_b32_e32 v2, 5 +; DEFAULT-NEXT: v_mov_b32_e32 v7, 6 +; DEFAULT-NEXT: v_mov_b32_e32 v6, 7 +; DEFAULT-NEXT: v_mov_b32_e32 v5, 8 +; DEFAULT-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: v_lshrrev_b32_e32 v16, 24, v4 +; DEFAULT-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; DEFAULT-NEXT: v_lshrrev_b32_e32 v15, 8, v4 +; DEFAULT-NEXT: v_lshrrev_b32_e32 v14, 24, v3 +; DEFAULT-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; DEFAULT-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; DEFAULT-NEXT: .LBB10_2: ; %Flow +; DEFAULT-NEXT: s_or_b64 exec, exec, s[2:3] +; DEFAULT-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; DEFAULT-NEXT: s_cbranch_execz .LBB10_4 +; DEFAULT-NEXT: ; %bb.3: ; %bb.2 +; DEFAULT-NEXT: v_lshlrev_b16_e32 v3, 8, v10 +; DEFAULT-NEXT: v_lshlrev_b16_e32 v4, 8, v8 +; DEFAULT-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; DEFAULT-NEXT: v_lshlrev_b16_e32 v4, 8, v7 +; DEFAULT-NEXT: v_lshlrev_b16_e32 v11, 8, v5 +; DEFAULT-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v11, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_mov_b32_e32 v0, 0 +; DEFAULT-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; DEFAULT-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] +; DEFAULT-NEXT: v_mov_b32_e32 v3, v1 +; DEFAULT-NEXT: v_mov_b32_e32 v13, v10 +; DEFAULT-NEXT: v_mov_b32_e32 v11, v9 +; DEFAULT-NEXT: v_mov_b32_e32 v14, v8 +; DEFAULT-NEXT: v_mov_b32_e32 v4, v2 +; DEFAULT-NEXT: v_mov_b32_e32 v15, v7 +; DEFAULT-NEXT: v_mov_b32_e32 v12, v6 +; DEFAULT-NEXT: v_mov_b32_e32 v16, v5 +; DEFAULT-NEXT: .LBB10_4: ; %bb.3 +; DEFAULT-NEXT: s_or_b64 exec, exec, s[2:3] +; DEFAULT-NEXT: v_lshlrev_b16_e32 v0, 8, v13 +; DEFAULT-NEXT: v_lshlrev_b16_e32 v1, 8, v14 +; DEFAULT-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; DEFAULT-NEXT: v_lshlrev_b16_e32 v1, 8, v15 +; DEFAULT-NEXT: v_lshlrev_b16_e32 v3, 8, v16 +; DEFAULT-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_mov_b32_e32 v2, 0 +; DEFAULT-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15] +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 
%idx @@ -657,6 +1475,63 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v5, v[1:2], s[14:15] ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: v8i8_multi_block: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; FEATURE-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; FEATURE-NEXT: v_mov_b32_e32 v5, 0 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dwordx2 v[3:4], v6, s[8:9] +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: v_mov_b32_e32 v1, v3 +; FEATURE-NEXT: v_mov_b32_e32 v2, v4 +; FEATURE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; FEATURE-NEXT: s_cbranch_execz .LBB11_4 +; FEATURE-NEXT: ; %bb.1: ; %bb.1 +; FEATURE-NEXT: global_load_dwordx2 v[1:2], v6, s[10:11] +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; FEATURE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; FEATURE-NEXT: s_cbranch_execz .LBB11_3 +; FEATURE-NEXT: ; %bb.2: ; %bb.2 +; FEATURE-NEXT: v_mov_b32_e32 v0, 0 +; FEATURE-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] +; FEATURE-NEXT: .LBB11_3: ; %Flow +; FEATURE-NEXT: s_or_b64 exec, exec, s[2:3] +; FEATURE-NEXT: .LBB11_4: ; %bb.3 +; FEATURE-NEXT: s_or_b64 exec, exec, s[0:1] +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: global_store_dwordx2 v5, v[1:2], s[14:15] +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: v8i8_multi_block: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; DEFAULT-NEXT: v_mov_b32_e32 v5, 0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dwordx2 v[3:4], v6, s[8:9] +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: v_mov_b32_e32 v1, v3 +; DEFAULT-NEXT: v_mov_b32_e32 v2, v4 +; DEFAULT-NEXT: s_and_saveexec_b64 s[0:1], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB11_4 +; DEFAULT-NEXT: ; %bb.1: ; %bb.1 +; 
DEFAULT-NEXT: global_load_dwordx2 v[1:2], v6, s[10:11] +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; DEFAULT-NEXT: s_and_saveexec_b64 s[2:3], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB11_3 +; DEFAULT-NEXT: ; %bb.2: ; %bb.2 +; DEFAULT-NEXT: v_mov_b32_e32 v0, 0 +; DEFAULT-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] +; DEFAULT-NEXT: .LBB11_3: ; %Flow +; DEFAULT-NEXT: s_or_b64 exec, exec, s[2:3] +; DEFAULT-NEXT: .LBB11_4: ; %bb.3 +; DEFAULT-NEXT: s_or_b64 exec, exec, s[0:1] +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: global_store_dwordx2 v5, v[1:2], s[14:15] +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -705,6 +1580,57 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_store_dword v1, v0, s[0:1] ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: v32i8_loop_carried: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; FEATURE-NEXT: v_lshlrev_b32_e32 v1, 5, v0 +; FEATURE-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0 +; FEATURE-NEXT: s_mov_b32 s2, 0x2000604 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dword v1, v1, s[0:1] +; FEATURE-NEXT: s_mov_b64 s[0:1], 0 +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: v_mov_b32_e32 v0, v1 +; FEATURE-NEXT: .LBB12_1: ; %bb.1 +; FEATURE-NEXT: ; =>This Inner Loop Header: Depth=1 +; FEATURE-NEXT: s_and_b64 s[6:7], exec, vcc +; FEATURE-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; FEATURE-NEXT: v_perm_b32 v0, v1, v0, s2 +; FEATURE-NEXT: s_andn2_b64 exec, exec, s[0:1] +; FEATURE-NEXT: s_cbranch_execnz .LBB12_1 +; FEATURE-NEXT: ; %bb.2: ; %bb.2.loopexit +; FEATURE-NEXT: s_or_b64 exec, exec, s[0:1] +; FEATURE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; FEATURE-NEXT: v_mov_b32_e32 v1, 0 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_store_dword v1, v0, s[0:1] +; FEATURE-NEXT: s_endpgm +; +; 
DEFAULT-LABEL: v32i8_loop_carried: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v1, 5, v0 +; DEFAULT-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0 +; DEFAULT-NEXT: s_mov_b32 s2, 0x2000604 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dword v1, v1, s[0:1] +; DEFAULT-NEXT: s_mov_b64 s[0:1], 0 +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: v_mov_b32_e32 v0, v1 +; DEFAULT-NEXT: .LBB12_1: ; %bb.1 +; DEFAULT-NEXT: ; =>This Inner Loop Header: Depth=1 +; DEFAULT-NEXT: s_and_b64 s[6:7], exec, vcc +; DEFAULT-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; DEFAULT-NEXT: v_perm_b32 v0, v1, v0, s2 +; DEFAULT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; DEFAULT-NEXT: s_cbranch_execnz .LBB12_1 +; DEFAULT-NEXT: ; %bb.2: ; %bb.2.loopexit +; DEFAULT-NEXT: s_or_b64 exec, exec, s[0:1] +; DEFAULT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; DEFAULT-NEXT: v_mov_b32_e32 v1, 0 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_store_dword v1, v0, s[0:1] +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -811,6 +1737,177 @@ define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr ; GFX906-NEXT: global_store_dword v0, v4, s[0:1] offset:16 ; GFX906-NEXT: global_store_dword v0, v2, s[0:1] offset:24 ; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: v8i8_multiuse_multiblock: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; FEATURE-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; FEATURE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; FEATURE-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: global_load_dwordx2 v[1:2], v1, s[8:9] +; FEATURE-NEXT: s_waitcnt vmcnt(0) +; FEATURE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; FEATURE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; FEATURE-NEXT: s_cbranch_execz 
.LBB13_2 +; FEATURE-NEXT: ; %bb.1: ; %bb.1 +; FEATURE-NEXT: s_movk_i32 s6, 0xff00 +; FEATURE-NEXT: v_mov_b32_e32 v5, 8 +; FEATURE-NEXT: v_and_b32_sdwa v6, v1, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; FEATURE-NEXT: s_mov_b32 s6, 0x6070504 +; FEATURE-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; FEATURE-NEXT: v_and_b32_e32 v4, 0xffffff00, v1 +; FEATURE-NEXT: v_lshlrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; FEATURE-NEXT: v_perm_b32 v7, v1, v1, s6 +; FEATURE-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; FEATURE-NEXT: s_and_b64 s[6:7], vcc, exec +; FEATURE-NEXT: v_mov_b32_e32 v3, 0 +; FEATURE-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v6, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; FEATURE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; FEATURE-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; FEATURE-NEXT: global_store_dword v3, v1, s[12:13] +; FEATURE-NEXT: global_store_dword v3, v7, s[12:13] offset:8 +; FEATURE-NEXT: global_store_dword v3, v6, s[12:13] offset:16 +; FEATURE-NEXT: global_store_dword v3, v4, s[12:13] offset:24 +; FEATURE-NEXT: .LBB13_2: ; %Flow +; FEATURE-NEXT: s_or_b64 exec, exec, s[4:5] +; FEATURE-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; FEATURE-NEXT: s_cbranch_execz .LBB13_4 +; FEATURE-NEXT: ; %bb.3: ; %bb.2 +; FEATURE-NEXT: v_lshlrev_b16_e32 v3, 8, v2 +; FEATURE-NEXT: v_and_b32_e32 v4, 0xffffff00, v2 +; FEATURE-NEXT: v_and_b32_e32 v5, 0xffffff00, v1 +; FEATURE-NEXT: s_mov_b32 s2, 0xc0c0001 +; FEATURE-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; 
FEATURE-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_perm_b32 v2, 0, v2, s2 +; FEATURE-NEXT: v_mov_b32_e32 v0, 0 +; FEATURE-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; FEATURE-NEXT: v_perm_b32 v6, 0, v1, s2 +; FEATURE-NEXT: s_mov_b32 s3, 0xffff0000 +; FEATURE-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; FEATURE-NEXT: v_and_or_b32 v7, v1, s3, v6 +; FEATURE-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_e32 v2, v6, v2 +; FEATURE-NEXT: global_store_dword v0, v3, s[14:15] +; FEATURE-NEXT: global_store_dword v0, v4, s[14:15] offset:8 +; FEATURE-NEXT: global_store_dword v0, v7, s[14:15] offset:16 +; FEATURE-NEXT: global_store_dword v0, v2, s[14:15] offset:24 +; FEATURE-NEXT: .LBB13_4: ; %bb.3 +; FEATURE-NEXT: s_or_b64 exec, exec, s[4:5] +; FEATURE-NEXT: s_movk_i32 s3, 0xff00 +; FEATURE-NEXT: v_mov_b32_e32 v4, 8 +; FEATURE-NEXT: s_movk_i32 s2, 0xff +; FEATURE-NEXT: v_and_b32_sdwa v2, v1, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; FEATURE-NEXT: v_lshlrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; FEATURE-NEXT: v_or_b32_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v5, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; FEATURE-NEXT: v_lshlrev_b16_e32 v6, 8, v1 +; FEATURE-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; FEATURE-NEXT: v_mov_b32_e32 v0, 0 +; FEATURE-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v7, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; FEATURE-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; FEATURE-NEXT: global_store_dword v0, v3, s[0:1] +; FEATURE-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; FEATURE-NEXT: global_store_dword v0, v4, s[0:1] offset:16 +; FEATURE-NEXT: global_store_dword v0, v2, s[0:1] offset:24 +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: v8i8_multiuse_multiblock: +; DEFAULT: ; %bb.0: ; %entry +; DEFAULT-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; DEFAULT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; DEFAULT-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: global_load_dwordx2 v[1:2], v1, s[8:9] +; DEFAULT-NEXT: s_waitcnt vmcnt(0) +; DEFAULT-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; DEFAULT-NEXT: s_and_saveexec_b64 s[4:5], vcc +; DEFAULT-NEXT: s_cbranch_execz .LBB13_2 +; DEFAULT-NEXT: ; %bb.1: ; %bb.1 +; DEFAULT-NEXT: s_movk_i32 s6, 0xff00 +; DEFAULT-NEXT: v_mov_b32_e32 v5, 8 +; DEFAULT-NEXT: v_and_b32_sdwa v6, v1, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; DEFAULT-NEXT: s_mov_b32 s6, 0x6070504 +; DEFAULT-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; DEFAULT-NEXT: v_and_b32_e32 v4, 0xffffff00, v1 +; DEFAULT-NEXT: v_lshlrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; DEFAULT-NEXT: v_perm_b32 v7, v1, v1, s6 +; DEFAULT-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; DEFAULT-NEXT: s_and_b64 s[6:7], vcc, exec +; DEFAULT-NEXT: v_mov_b32_e32 v3, 0 +; DEFAULT-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v6, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; DEFAULT-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; DEFAULT-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; DEFAULT-NEXT: global_store_dword v3, v1, s[12:13] +; DEFAULT-NEXT: global_store_dword v3, v7, s[12:13] offset:8 +; DEFAULT-NEXT: global_store_dword v3, v6, s[12:13] offset:16 +; DEFAULT-NEXT: global_store_dword v3, v4, s[12:13] offset:24 +; DEFAULT-NEXT: .LBB13_2: ; %Flow +; DEFAULT-NEXT: s_or_b64 exec, exec, s[4:5] +; DEFAULT-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; DEFAULT-NEXT: s_cbranch_execz .LBB13_4 +; DEFAULT-NEXT: ; %bb.3: ; %bb.2 +; DEFAULT-NEXT: v_lshlrev_b16_e32 v3, 8, v2 +; DEFAULT-NEXT: v_and_b32_e32 v4, 0xffffff00, v2 +; DEFAULT-NEXT: v_and_b32_e32 v5, 0xffffff00, v1 +; DEFAULT-NEXT: s_mov_b32 s2, 0xc0c0001 +; DEFAULT-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_perm_b32 v2, 0, v2, s2 +; DEFAULT-NEXT: v_mov_b32_e32 v0, 0 +; DEFAULT-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; DEFAULT-NEXT: v_perm_b32 v6, 0, v1, s2 +; DEFAULT-NEXT: 
s_mov_b32 s3, 0xffff0000 +; DEFAULT-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; DEFAULT-NEXT: v_and_or_b32 v7, v1, s3, v6 +; DEFAULT-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_e32 v2, v6, v2 +; DEFAULT-NEXT: global_store_dword v0, v3, s[14:15] +; DEFAULT-NEXT: global_store_dword v0, v4, s[14:15] offset:8 +; DEFAULT-NEXT: global_store_dword v0, v7, s[14:15] offset:16 +; DEFAULT-NEXT: global_store_dword v0, v2, s[14:15] offset:24 +; DEFAULT-NEXT: .LBB13_4: ; %bb.3 +; DEFAULT-NEXT: s_or_b64 exec, exec, s[4:5] +; DEFAULT-NEXT: s_movk_i32 s3, 0xff00 +; DEFAULT-NEXT: v_mov_b32_e32 v4, 8 +; DEFAULT-NEXT: s_movk_i32 s2, 0xff +; DEFAULT-NEXT: v_and_b32_sdwa v2, v1, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; DEFAULT-NEXT: v_lshlrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; DEFAULT-NEXT: v_or_b32_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v5, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; DEFAULT-NEXT: v_lshlrev_b16_e32 v6, 8, v1 +; DEFAULT-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; DEFAULT-NEXT: v_mov_b32_e32 v0, 0 +; DEFAULT-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v7, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
DEFAULT-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; DEFAULT-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; DEFAULT-NEXT: global_store_dword v0, v3, s[0:1] +; DEFAULT-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; DEFAULT-NEXT: global_store_dword v0, v4, s[0:1] offset:16 +; DEFAULT-NEXT: global_store_dword v0, v2, s[0:1] offset:24 +; DEFAULT-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -866,5 +1963,137 @@ bb.3: ret void } +; This should not cause Assertion `getType() == V->getType() && "All operands to PHI node must be the same type as the PHI node +; Note: whether or not the assertion fires depends on the iteration ortder of PhiNodes in AMDGPULateCodeGenPrepare, which +; is non-deterministic due to iterators over a set of pointers. + +define amdgpu_kernel void @MissingInc_PhiChain(i1 %cmp1.i.i.i.i.i.not, <16 x i8> %promotealloca31.i.i.i.i) { +; GFX906-LABEL: MissingInc_PhiChain: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: s_bitcmp1_b32 s0, 0 +; GFX906-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX906-NEXT: s_xor_b64 s[0:1], s[2:3], -1 +; GFX906-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX906-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX906-NEXT: s_branch .LBB14_2 +; GFX906-NEXT: .LBB14_1: ; %Flow1 +; GFX906-NEXT: ; in Loop: Header=BB14_2 Depth=1 +; GFX906-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX906-NEXT: s_cbranch_vccnz .LBB14_6 +; GFX906-NEXT: .LBB14_2: ; %for.body10.i.i.i.i +; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX906-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX906-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX906-NEXT: s_cbranch_vccnz .LBB14_4 +; GFX906-NEXT: ; %bb.3: ; %if.then.i.i.i.i.i +; GFX906-NEXT: ; in Loop: Header=BB14_2 Depth=1 +; GFX906-NEXT: s_mov_b64 s[4:5], -1 +; 
GFX906-NEXT: .LBB14_4: ; %Flow +; GFX906-NEXT: ; in Loop: Header=BB14_2 Depth=1 +; GFX906-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX906-NEXT: s_mov_b64 s[4:5], -1 +; GFX906-NEXT: s_cbranch_vccnz .LBB14_1 +; GFX906-NEXT: ; %bb.5: ; %if.end.i.i.i.i.i +; GFX906-NEXT: ; in Loop: Header=BB14_2 Depth=1 +; GFX906-NEXT: s_mov_b64 s[4:5], 0 +; GFX906-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX906-NEXT: s_branch .LBB14_1 +; GFX906-NEXT: .LBB14_6: ; %DummyReturnBlock +; GFX906-NEXT: s_endpgm +; FEATURE-LABEL: MissingInc_PhiChain: +; FEATURE: ; %bb.0: ; %entry +; FEATURE-NEXT: s_load_dword s0, s[4:5], 0x24 +; FEATURE-NEXT: s_waitcnt lgkmcnt(0) +; FEATURE-NEXT: s_bitcmp1_b32 s0, 0 +; FEATURE-NEXT: s_cselect_b64 s[2:3], -1, 0 +; FEATURE-NEXT: s_xor_b64 s[0:1], s[2:3], -1 +; FEATURE-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; FEATURE-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; FEATURE-NEXT: s_branch .LBB14_2 +; FEATURE-NEXT: .LBB14_1: ; %Flow1 +; FEATURE-NEXT: ; in Loop: Header=BB14_2 Depth=1 +; FEATURE-NEXT: s_and_b64 vcc, exec, s[4:5] +; FEATURE-NEXT: s_cbranch_vccnz .LBB14_6 +; FEATURE-NEXT: .LBB14_2: ; %for.body10.i.i.i.i +; FEATURE-NEXT: ; =>This Inner Loop Header: Depth=1 +; FEATURE-NEXT: s_and_b64 vcc, exec, s[0:1] +; FEATURE-NEXT: s_mov_b64 s[4:5], s[2:3] +; FEATURE-NEXT: s_cbranch_vccnz .LBB14_4 +; FEATURE-NEXT: ; %bb.3: ; %if.then.i.i.i.i.i +; FEATURE-NEXT: ; in Loop: Header=BB14_2 Depth=1 +; FEATURE-NEXT: s_mov_b64 s[4:5], -1 +; FEATURE-NEXT: .LBB14_4: ; %Flow +; FEATURE-NEXT: ; in Loop: Header=BB14_2 Depth=1 +; FEATURE-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; FEATURE-NEXT: s_mov_b64 s[4:5], -1 +; FEATURE-NEXT: s_cbranch_vccnz .LBB14_1 +; FEATURE-NEXT: ; %bb.5: ; %if.end.i.i.i.i.i +; FEATURE-NEXT: ; in Loop: Header=BB14_2 Depth=1 +; FEATURE-NEXT: s_mov_b64 s[4:5], 0 +; FEATURE-NEXT: s_and_b64 vcc, exec, s[0:1] +; FEATURE-NEXT: s_branch .LBB14_1 +; FEATURE-NEXT: .LBB14_6: ; %DummyReturnBlock +; FEATURE-NEXT: s_endpgm +; +; DEFAULT-LABEL: MissingInc_PhiChain: +; DEFAULT: ; %bb.0: 
; %entry +; DEFAULT-NEXT: s_load_dword s0, s[4:5], 0x24 +; DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; DEFAULT-NEXT: s_bitcmp1_b32 s0, 0 +; DEFAULT-NEXT: s_cselect_b64 s[2:3], -1, 0 +; DEFAULT-NEXT: s_xor_b64 s[0:1], s[2:3], -1 +; DEFAULT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; DEFAULT-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; DEFAULT-NEXT: s_branch .LBB14_2 +; DEFAULT-NEXT: .LBB14_1: ; %Flow1 +; DEFAULT-NEXT: ; in Loop: Header=BB14_2 Depth=1 +; DEFAULT-NEXT: s_and_b64 vcc, exec, s[4:5] +; DEFAULT-NEXT: s_cbranch_vccnz .LBB14_6 +; DEFAULT-NEXT: .LBB14_2: ; %for.body10.i.i.i.i +; DEFAULT-NEXT: ; =>This Inner Loop Header: Depth=1 +; DEFAULT-NEXT: s_and_b64 vcc, exec, s[0:1] +; DEFAULT-NEXT: s_mov_b64 s[4:5], s[2:3] +; DEFAULT-NEXT: s_cbranch_vccnz .LBB14_4 +; DEFAULT-NEXT: ; %bb.3: ; %if.then.i.i.i.i.i +; DEFAULT-NEXT: ; in Loop: Header=BB14_2 Depth=1 +; DEFAULT-NEXT: s_mov_b64 s[4:5], -1 +; DEFAULT-NEXT: .LBB14_4: ; %Flow +; DEFAULT-NEXT: ; in Loop: Header=BB14_2 Depth=1 +; DEFAULT-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; DEFAULT-NEXT: s_mov_b64 s[4:5], -1 +; DEFAULT-NEXT: s_cbranch_vccnz .LBB14_1 +; DEFAULT-NEXT: ; %bb.5: ; %if.end.i.i.i.i.i +; DEFAULT-NEXT: ; in Loop: Header=BB14_2 Depth=1 +; DEFAULT-NEXT: s_mov_b64 s[4:5], 0 +; DEFAULT-NEXT: s_and_b64 vcc, exec, s[0:1] +; DEFAULT-NEXT: s_branch .LBB14_1 +; DEFAULT-NEXT: .LBB14_6: ; %DummyReturnBlock +; DEFAULT-NEXT: s_endpgm +entry: + br label %for.body10.i.i.i.i + +for.body10.i.i.i.i: ; preds = %if.end.1.i.i.i.i.i, %entry + %promotealloca3237.i.i.i.i = phi <16 x i8> [ , %entry ], [ %1, %if.end.1.i.i.i.i.i ] + br i1 %cmp1.i.i.i.i.i.not, label %if.end.i.i.i.i.i, label %if.then.i.i.i.i.i + +if.then.i.i.i.i.i: ; preds = %for.body10.i.i.i.i + %0 = insertelement <16 x i8> %promotealloca3237.i.i.i.i, i8 0, i64 0 + br label %if.end.i.i.i.i.i + +if.end.i.i.i.i.i: ; preds = %if.then.i.i.i.i.i, %for.body10.i.i.i.i + %promotealloca31.i.i.i.i3 = phi <16 x i8> [ %0, %if.then.i.i.i.i.i ], [ %promotealloca3237.i.i.i.i, 
%for.body10.i.i.i.i ] + br i1 %cmp1.i.i.i.i.i.not, label %if.end.1.i.i.i.i.i, label %if.then.1.i.i.i.i.i + +if.then.1.i.i.i.i.i: ; preds = %if.end.i.i.i.i.i + br label %if.end.1.i.i.i.i.i + +if.end.1.i.i.i.i.i: ; preds = %if.then.1.i.i.i.i.i, %if.end.i.i.i.i.i + %promotealloca30.i.i.i.i = phi <16 x i8> [ %promotealloca31.i.i.i.i, %if.then.1.i.i.i.i.i ], [ %promotealloca31.i.i.i.i3, %if.end.i.i.i.i.i ] + %1 = shufflevector <16 x i8> %promotealloca30.i.i.i.i, <16 x i8> zeroinitializer, <16 x i32> + br label %for.body10.i.i.i.i +} + declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll index 29996d68040e7..c1fe29ad2218e 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -mtriple=amdgcn -mcpu=gfx906 -amdgpu-late-codegenprepare -S -o - %s | FileCheck --check-prefix=GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare -amdgpu-coerce-illegal-types=1 < %s | FileCheck --check-prefix=FEATURE %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare < %s | FileCheck --check-prefix=DEFAULT %s define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: define amdgpu_kernel void @v3i8_liveout( @@ -25,6 +26,56 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: store <3 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4 ; GFX906-NEXT: ret void ; +; FEATURE-LABEL: define amdgpu_kernel void @v3i8_liveout( +; FEATURE-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0:[0-9]+]] { +; FEATURE-NEXT: entry: +; FEATURE-NEXT: [[IDX:%.*]] = call i32 
@llvm.amdgcn.workitem.id.x() +; FEATURE-NEXT: [[GEP1:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC1:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP1]], align 4 +; FEATURE-NEXT: [[TMP0:%.*]] = shufflevector <3 x i8> [[VEC1]], <3 x i8> poison, <4 x i32> +; FEATURE-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; FEATURE-NEXT: [[GEP2:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC2:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP2]], align 4 +; FEATURE-NEXT: [[TMP1:%.*]] = shufflevector <3 x i8> [[VEC2]], <3 x i8> poison, <4 x i32> +; FEATURE-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[TMP1]] to i32 +; FEATURE-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; FEATURE: bb.1: +; FEATURE-NEXT: br label [[BB_2]] +; FEATURE: bb.2: +; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC2:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[TMP2:%.*]] = trunc i32 [[PHI5_TC]] to i24 +; FEATURE-NEXT: [[TMP3:%.*]] = bitcast i24 [[TMP2]] to <3 x i8> +; FEATURE-NEXT: store <3 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4 +; FEATURE-NEXT: ret void +; +; DEFAULT-LABEL: define amdgpu_kernel void @v3i8_liveout( +; DEFAULT-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0:[0-9]+]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[GEP1:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC1:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP1]], align 4 +; DEFAULT-NEXT: [[TMP0:%.*]] = shufflevector <3 x i8> [[VEC1]], <3 x i8> poison, <4 x i32> +; 
DEFAULT-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; DEFAULT-NEXT: [[GEP2:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC2:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP2]], align 4 +; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <3 x i8> [[VEC2]], <3 x i8> poison, <4 x i32> +; DEFAULT-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[TMP1]] to i32 +; DEFAULT-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; DEFAULT: bb.1: +; DEFAULT-NEXT: br label [[BB_2]] +; DEFAULT: bb.2: +; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5_TC2:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[TMP2:%.*]] = trunc i32 [[PHI5_TC]] to i24 +; DEFAULT-NEXT: [[PHI5:%.*]] = bitcast i24 [[TMP2]] to <3 x i8> +; DEFAULT-NEXT: store <3 x i8> [[PHI5]], ptr addrspace(1) [[DST]], align 4 +; DEFAULT-NEXT: ret void +; entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -37,8 +88,8 @@ bb.1: br label %bb.2 bb.2: - %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] - store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + %phi5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <3 x i8> %phi5, ptr addrspace(1) %dst, align 4 ret void } @@ -63,6 +114,50 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4 ; GFX906-NEXT: ret void ; +; FEATURE-LABEL: define amdgpu_kernel void @v4i8_liveout( +; FEATURE-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; FEATURE-NEXT: entry: +; FEATURE-NEXT: [[IDX:%.*]] = call i32 
@llvm.amdgcn.workitem.id.x() +; FEATURE-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4 +; FEATURE-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32 +; FEATURE-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4 +; FEATURE-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32 +; FEATURE-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; FEATURE: bb.1: +; FEATURE-NEXT: br label [[BB_2]] +; FEATURE: bb.2: +; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC2:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC_BC:%.*]] = bitcast i32 [[PHI5_TC]] to <4 x i8> +; FEATURE-NEXT: store <4 x i8> [[PHI5_TC_BC]], ptr addrspace(1) [[DST]], align 4 +; FEATURE-NEXT: ret void +; +; DEFAULT-LABEL: define amdgpu_kernel void @v4i8_liveout( +; DEFAULT-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4 +; DEFAULT-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32 +; DEFAULT-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4 +; DEFAULT-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32 +; DEFAULT-NEXT: 
[[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; DEFAULT: bb.1: +; DEFAULT-NEXT: br label [[BB_2]] +; DEFAULT: bb.2: +; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5_TC2:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5:%.*]] = bitcast i32 [[PHI5_TC]] to <4 x i8> +; DEFAULT-NEXT: store <4 x i8> [[PHI5]], ptr addrspace(1) [[DST]], align 4 +; DEFAULT-NEXT: ret void +; entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -75,8 +170,8 @@ bb.1: br label %bb.2 bb.2: - %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] - store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + %phi5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <4 x i8> %phi5, ptr addrspace(1) %dst, align 4 ret void } @@ -104,6 +199,56 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: store <5 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4 ; GFX906-NEXT: ret void ; +; FEATURE-LABEL: define amdgpu_kernel void @v5i8_liveout( +; FEATURE-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; FEATURE-NEXT: entry: +; FEATURE-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; FEATURE-NEXT: [[GEP1:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC1:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP1]], align 8 +; FEATURE-NEXT: [[TMP0:%.*]] = shufflevector <5 x i8> [[VEC1]], <5 x i8> poison, <8 x i32> +; FEATURE-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +; FEATURE-NEXT: [[GEP2:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; FEATURE-NEXT: 
[[VEC2:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP2]], align 8 +; FEATURE-NEXT: [[TMP1:%.*]] = shufflevector <5 x i8> [[VEC2]], <5 x i8> poison, <8 x i32> +; FEATURE-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +; FEATURE-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; FEATURE: bb.1: +; FEATURE-NEXT: br label [[BB_2]] +; FEATURE: bb.2: +; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC2:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8> +; FEATURE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> +; FEATURE-NEXT: store <5 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4 +; FEATURE-NEXT: ret void +; +; DEFAULT-LABEL: define amdgpu_kernel void @v5i8_liveout( +; DEFAULT-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[GEP1:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC1:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP1]], align 8 +; DEFAULT-NEXT: [[TMP0:%.*]] = shufflevector <5 x i8> [[VEC1]], <5 x i8> poison, <8 x i32> +; DEFAULT-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +; DEFAULT-NEXT: [[GEP2:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC2:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP2]], align 8 +; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <5 x i8> [[VEC2]], <5 x i8> poison, <8 x i32> +; DEFAULT-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +; 
DEFAULT-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; DEFAULT: bb.1: +; DEFAULT-NEXT: br label [[BB_2]] +; DEFAULT: bb.2: +; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5_TC2:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8> +; DEFAULT-NEXT: [[PHI5:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> +; DEFAULT-NEXT: store <5 x i8> [[PHI5]], ptr addrspace(1) [[DST]], align 4 +; DEFAULT-NEXT: ret void +; entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -116,8 +261,8 @@ bb.1: br label %bb.2 bb.2: - %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] - store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + %phi5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <5 x i8> %phi5, ptr addrspace(1) %dst, align 4 ret void } @@ -142,6 +287,50 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4 ; GFX906-NEXT: ret void ; +; FEATURE-LABEL: define amdgpu_kernel void @v8i8_liveout( +; FEATURE-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; FEATURE-NEXT: entry: +; FEATURE-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; FEATURE-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8 +; FEATURE-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32> +; FEATURE-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) 
[[SRC2]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8 +; FEATURE-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32> +; FEATURE-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; FEATURE: bb.1: +; FEATURE-NEXT: br label [[BB_2]] +; FEATURE: bb.2: +; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC2:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC_BC:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8> +; FEATURE-NEXT: store <8 x i8> [[PHI5_TC_BC]], ptr addrspace(1) [[DST]], align 4 +; FEATURE-NEXT: ret void +; +; DEFAULT-LABEL: define amdgpu_kernel void @v8i8_liveout( +; DEFAULT-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8 +; DEFAULT-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32> +; DEFAULT-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8 +; DEFAULT-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32> +; DEFAULT-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; DEFAULT: bb.1: +; DEFAULT-NEXT: br label [[BB_2]] +; DEFAULT: bb.2: +; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: 
[[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5_TC2:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8> +; DEFAULT-NEXT: store <8 x i8> [[PHI5]], ptr addrspace(1) [[DST]], align 4 +; DEFAULT-NEXT: ret void +; entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -154,8 +343,8 @@ bb.1: br label %bb.2 bb.2: - %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] - store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + %phi5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <8 x i8> %phi5, ptr addrspace(1) %dst, align 4 ret void } @@ -185,6 +374,60 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr ; GFX906: return: ; GFX906-NEXT: ret void ; +; FEATURE-LABEL: define amdgpu_kernel void @repeat_successor( +; FEATURE-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; FEATURE-NEXT: entry: +; FEATURE-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; FEATURE-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4 +; FEATURE-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32 +; FEATURE-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4 +; FEATURE-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32 +; FEATURE-NEXT: switch i32 [[IN]], label [[RETURN:%.*]] [ +; FEATURE-NEXT: i32 1, label [[RETURN_SINK_SPLIT:%.*]] +; FEATURE-NEXT: i32 2, label [[RETURN_SINK_SPLIT]] +; FEATURE-NEXT: i32 3, label [[SW_BB5:%.*]] +; FEATURE-NEXT: ] +; FEATURE: sw.bb5: +; FEATURE-NEXT: br label 
[[RETURN_SINK_SPLIT]] +; FEATURE: return.sink.split: +; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ] +; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC1_BC]], [[ENTRY]] ] +; FEATURE-NEXT: [[PHI5_TC2:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC1_BC]], [[ENTRY]] ] +; FEATURE-NEXT: [[PHI5_TC_BC:%.*]] = bitcast i32 [[PHI5_TC]] to <4 x i8> +; FEATURE-NEXT: store <4 x i8> [[PHI5_TC_BC]], ptr addrspace(1) [[DST]], align 4 +; FEATURE-NEXT: ret void +; FEATURE: return: +; FEATURE-NEXT: ret void +; +; DEFAULT-LABEL: define amdgpu_kernel void @repeat_successor( +; DEFAULT-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4 +; DEFAULT-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32 +; DEFAULT-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4 +; DEFAULT-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32 +; DEFAULT-NEXT: switch i32 [[IN]], label [[RETURN:%.*]] [ +; DEFAULT-NEXT: i32 1, label [[RETURN_SINK_SPLIT:%.*]] +; DEFAULT-NEXT: i32 2, label [[RETURN_SINK_SPLIT]] +; DEFAULT-NEXT: i32 3, label [[SW_BB5:%.*]] +; DEFAULT-NEXT: ] +; DEFAULT: sw.bb5: +; DEFAULT-NEXT: br label [[RETURN_SINK_SPLIT]] +; DEFAULT: return.sink.split: +; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ] +; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], 
[ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC1_BC]], [[ENTRY]] ] +; DEFAULT-NEXT: [[PHI5_TC2:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC1_BC]], [[ENTRY]] ] +; DEFAULT-NEXT: [[PHI5:%.*]] = bitcast i32 [[PHI5_TC]] to <4 x i8> +; DEFAULT-NEXT: store <4 x i8> [[PHI5]], ptr addrspace(1) [[DST]], align 4 +; DEFAULT-NEXT: ret void +; DEFAULT: return: +; DEFAULT-NEXT: ret void +; entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -201,8 +444,8 @@ sw.bb5: br label %return.sink.split return.sink.split: - %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ] - store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4 + %phi5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ] + store <4 x i8> %phi5, ptr addrspace(1) %dst, align 4 ret void return: @@ -236,6 +479,70 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: store <8 x i8> [[TMP7_TC_BC]], ptr addrspace(1) [[DST1]], align 4 ; GFX906-NEXT: ret void ; +; FEATURE-LABEL: define amdgpu_kernel void @v8i8_phi_chain( +; FEATURE-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] { +; FEATURE-NEXT: entry: +; FEATURE-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; FEATURE-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8 +; FEATURE-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32> +; FEATURE-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8 +; FEATURE-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32> +; FEATURE-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; FEATURE-NEXT: br i1 
[[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; FEATURE: bb.1: +; FEATURE-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7 +; FEATURE-NEXT: br i1 [[CMP2]], label [[BB_2]], label [[BB_3:%.*]] +; FEATURE: bb.2: +; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC3:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC5:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[PHI5_TC_BC:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8> +; FEATURE-NEXT: store <8 x i8> [[PHI5_TC_BC]], ptr addrspace(1) [[DST0]], align 4 +; FEATURE-NEXT: br label [[BB_3]] +; FEATURE: bb.3: +; FEATURE-NEXT: [[PHI7_TC:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC]], [[BB_2]] ] +; FEATURE-NEXT: [[PHI7_TC2:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC1]], [[BB_2]] ] +; FEATURE-NEXT: [[PHI7_TC4:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC3]], [[BB_2]] ] +; FEATURE-NEXT: [[PHI7_TC6:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC5]], [[BB_2]] ] +; FEATURE-NEXT: [[PHI7_TC_BC:%.*]] = bitcast <2 x i32> [[PHI7_TC]] to <8 x i8> +; FEATURE-NEXT: store <8 x i8> [[PHI7_TC_BC]], ptr addrspace(1) [[DST1]], align 4 +; FEATURE-NEXT: ret void +; +; DEFAULT-LABEL: define amdgpu_kernel void @v8i8_phi_chain( +; DEFAULT-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8 +; DEFAULT-NEXT: [[VEC1_BC:%.*]] = bitcast <8 
x i8> [[VEC1]] to <2 x i32> +; DEFAULT-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8 +; DEFAULT-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32> +; DEFAULT-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]] +; DEFAULT: bb.1: +; DEFAULT-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7 +; DEFAULT-NEXT: br i1 [[CMP2]], label [[BB_2]], label [[BB_3:%.*]] +; DEFAULT: bb.2: +; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5_TC3:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5_TC5:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[PHI5:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8> +; DEFAULT-NEXT: store <8 x i8> [[PHI5]], ptr addrspace(1) [[DST0]], align 4 +; DEFAULT-NEXT: br label [[BB_3]] +; DEFAULT: bb.3: +; DEFAULT-NEXT: [[PHI7_TC:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC]], [[BB_2]] ] +; DEFAULT-NEXT: [[PHI7_TC2:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC1]], [[BB_2]] ] +; DEFAULT-NEXT: [[PHI7_TC4:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC3]], [[BB_2]] ] +; DEFAULT-NEXT: [[PHI7_TC6:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC5]], [[BB_2]] ] +; DEFAULT-NEXT: [[PHI7:%.*]] = bitcast <2 x i32> [[PHI7_TC]] to <8 x i8> +; DEFAULT-NEXT: store <8 x i8> [[PHI7]], ptr addrspace(1) [[DST1]], align 4 +; DEFAULT-NEXT: ret void +; entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -249,13 +556,13 @@ bb.1: br i1 %cmp2, label %bb.2, label %bb.3 bb.2: - %tmp5 = phi <8 x i8> [ 
%vec1, %entry ], [ %vec2, %bb.1 ] - store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4 + %phi5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + store <8 x i8> %phi5, ptr addrspace(1) %dst0, align 4 br label %bb.3 bb.3: - %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2] - store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4 + %phi7 = phi <8 x i8> [ %vec2, %bb.1], [%phi5, %bb.2] + store <8 x i8> %phi7, ptr addrspace(1) %dst1, align 4 ret void } @@ -285,6 +592,60 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac ; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST1]], align 4 ; GFX906-NEXT: ret void ; +; FEATURE-LABEL: define amdgpu_kernel void @v8i8_multi_block( +; FEATURE-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] { +; FEATURE-NEXT: entry: +; FEATURE-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; FEATURE-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8 +; FEATURE-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32> +; FEATURE-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8 +; FEATURE-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32> +; FEATURE-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_3:%.*]] +; FEATURE: bb.1: +; FEATURE-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7 +; FEATURE-NEXT: br i1 [[CMP2]], label [[BB_2:%.*]], label [[BB_3]] +; FEATURE: bb.2: +; FEATURE-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <2 x i32> [[VEC1_BC]] to <8 x i8> +; FEATURE-NEXT: store <8 x i8> [[VEC1_BC_BC]], ptr addrspace(1) [[DST0]], align 4 +; FEATURE-NEXT: br label [[BB_3]] +; FEATURE: 
bb.3: +; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ] +; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ] +; FEATURE-NEXT: [[PHI5_TC2:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ] +; FEATURE-NEXT: [[PHI5_TC_BC:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8> +; FEATURE-NEXT: store <8 x i8> [[PHI5_TC_BC]], ptr addrspace(1) [[DST1]], align 4 +; FEATURE-NEXT: ret void +; +; DEFAULT-LABEL: define amdgpu_kernel void @v8i8_multi_block( +; DEFAULT-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8 +; DEFAULT-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32> +; DEFAULT-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8 +; DEFAULT-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32> +; DEFAULT-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_3:%.*]] +; DEFAULT: bb.1: +; DEFAULT-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7 +; DEFAULT-NEXT: br i1 [[CMP2]], label [[BB_2:%.*]], label [[BB_3]] +; DEFAULT: bb.2: +; DEFAULT-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <2 x i32> [[VEC1_BC]] to <8 x i8> +; DEFAULT-NEXT: store <8 x i8> [[VEC1_BC_BC]], ptr addrspace(1) [[DST0]], align 4 +; DEFAULT-NEXT: br label [[BB_3]] +; DEFAULT: bb.3: +; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], 
[[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ] +; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ] +; DEFAULT-NEXT: [[PHI5_TC2:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ] +; DEFAULT-NEXT: [[PHI5:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8> +; DEFAULT-NEXT: store <8 x i8> [[PHI5]], ptr addrspace(1) [[DST1]], align 4 +; DEFAULT-NEXT: ret void +; entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -302,8 +663,8 @@ bb.2: br label %bb.3 bb.3: - %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2] - store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4 + %phi5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2] + store <8 x i8> %phi5, ptr addrspace(1) %dst1, align 4 ret void } @@ -331,6 +692,56 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp ; GFX906-NEXT: store <4 x i8> [[VEC2_BC_BC]], ptr addrspace(1) [[DST]], align 4 ; GFX906-NEXT: ret void ; +; FEATURE-LABEL: define amdgpu_kernel void @v32i8_loop_carried( +; FEATURE-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; FEATURE-NEXT: entry: +; FEATURE-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; FEATURE-NEXT: [[GEP1:%.*]] = getelementptr <32 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; FEATURE-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4 +; FEATURE-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32 +; FEATURE-NEXT: br label [[BB_1:%.*]] +; FEATURE: bb.1: +; FEATURE-NEXT: [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ] +; FEATURE-NEXT: [[TEMP_TC1:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[TEMP_TC2:%.*]] = phi i32 
[ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; FEATURE-NEXT: [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8> +; FEATURE-NEXT: [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8> +; FEATURE-NEXT: [[VEC2:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> +; FEATURE-NEXT: [[VEC2_BC]] = bitcast <4 x i8> [[VEC2]] to i32 +; FEATURE-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15 +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_1]], label [[BB_2:%.*]] +; FEATURE: 0: +; FEATURE-NEXT: br label [[BB_2]] +; FEATURE: bb.2: +; FEATURE-NEXT: [[VEC2_BC_BC:%.*]] = bitcast i32 [[VEC2_BC]] to <4 x i8> +; FEATURE-NEXT: store <4 x i8> [[VEC2_BC_BC]], ptr addrspace(1) [[DST]], align 4 +; FEATURE-NEXT: ret void +; +; DEFAULT-LABEL: define amdgpu_kernel void @v32i8_loop_carried( +; DEFAULT-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[GEP1:%.*]] = getelementptr <32 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]] +; DEFAULT-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4 +; DEFAULT-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32 +; DEFAULT-NEXT: br label [[BB_1:%.*]] +; DEFAULT: bb.1: +; DEFAULT-NEXT: [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ] +; DEFAULT-NEXT: [[TEMP_TC1:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[TEMP_TC2:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ] +; DEFAULT-NEXT: [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8> +; DEFAULT-NEXT: [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8> +; DEFAULT-NEXT: [[VEC3:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> +; DEFAULT-NEXT: [[VEC2_BC]] = bitcast <4 x i8> [[VEC3]] to i32 +; DEFAULT-NEXT: [[CMP:%.*]] = icmp ult 
i32 [[IDX]], 15 +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_1]], label [[BB_2:%.*]] +; DEFAULT: 0: +; DEFAULT-NEXT: br label [[BB_2]] +; DEFAULT: bb.2: +; DEFAULT-NEXT: [[VEC2:%.*]] = bitcast i32 [[VEC2_BC]] to <4 x i8> +; DEFAULT-NEXT: store <4 x i8> [[VEC2]], ptr addrspace(1) [[DST]], align 4 +; DEFAULT-NEXT: ret void +; entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -371,6 +782,44 @@ define void @broken_phi() { ; GFX906-NEXT: [[I8]] = phi <4 x i8> [ zeroinitializer, [[BB5]] ], [ zeroinitializer, [[BB3]] ] ; GFX906-NEXT: br label [[BB1]] ; +; FEATURE-LABEL: define void @broken_phi( +; FEATURE-SAME: ) #[[ATTR0]] { +; FEATURE-NEXT: bb: +; FEATURE-NEXT: br label [[BB1:%.*]] +; FEATURE: bb1: +; FEATURE-NEXT: [[I:%.*]] = phi <4 x i8> [ splat (i8 1), [[BB:%.*]] ], [ [[I8:%.*]], [[BB7:%.*]] ] +; FEATURE-NEXT: br i1 false, label [[BB3:%.*]], label [[BB2:%.*]] +; FEATURE: bb2: +; FEATURE-NEXT: br label [[BB3]] +; FEATURE: bb3: +; FEATURE-NEXT: [[I4:%.*]] = phi <4 x i8> [ zeroinitializer, [[BB2]] ], [ [[I]], [[BB1]] ] +; FEATURE-NEXT: br i1 false, label [[BB7]], label [[BB5:%.*]] +; FEATURE: bb5: +; FEATURE-NEXT: [[I6:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[I4]], <4 x i8> zeroinitializer) +; FEATURE-NEXT: br label [[BB7]] +; FEATURE: bb7: +; FEATURE-NEXT: [[I8]] = phi <4 x i8> [ zeroinitializer, [[BB5]] ], [ zeroinitializer, [[BB3]] ] +; FEATURE-NEXT: br label [[BB1]] +; +; DEFAULT-LABEL: define void @broken_phi( +; DEFAULT-SAME: ) #[[ATTR0]] { +; DEFAULT-NEXT: bb: +; DEFAULT-NEXT: br label [[BB1:%.*]] +; DEFAULT: bb1: +; DEFAULT-NEXT: [[I:%.*]] = phi <4 x i8> [ splat (i8 1), [[BB:%.*]] ], [ [[I8:%.*]], [[BB7:%.*]] ] +; DEFAULT-NEXT: br i1 false, label [[BB3:%.*]], label [[BB2:%.*]] +; DEFAULT: bb2: +; DEFAULT-NEXT: br label [[BB3]] +; DEFAULT: bb3: +; DEFAULT-NEXT: [[I4:%.*]] = phi <4 x i8> [ zeroinitializer, [[BB2]] ], [ [[I]], [[BB1]] ] +; DEFAULT-NEXT: br i1 false, label [[BB7]], label 
[[BB5:%.*]] +; DEFAULT: bb5: +; DEFAULT-NEXT: [[I6:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[I4]], <4 x i8> zeroinitializer) +; DEFAULT-NEXT: br label [[BB7]] +; DEFAULT: bb7: +; DEFAULT-NEXT: [[I8]] = phi <4 x i8> [ zeroinitializer, [[BB5]] ], [ zeroinitializer, [[BB3]] ] +; DEFAULT-NEXT: br label [[BB1]] +; bb: br label %bb1 bb1: @@ -410,6 +859,40 @@ define amdgpu_kernel void @reuseOp() { ; GFX906-NEXT: [[VAL:%.*]] = extractelement <16 x i8> [[SEL0_BC_BC]], i64 0 ; GFX906-NEXT: ret void ; +; FEATURE-LABEL: define amdgpu_kernel void @reuseOp( +; FEATURE-SAME: ) #[[ATTR0]] { +; FEATURE-NEXT: entry: +; FEATURE-NEXT: [[VEC1:%.*]] = insertelement <16 x i8> zeroinitializer, i8 0, i64 0 +; FEATURE-NEXT: [[VEC1_BC:%.*]] = bitcast <16 x i8> [[VEC1]] to <4 x i32> +; FEATURE-NEXT: br label [[BB_1:%.*]] +; FEATURE: bb.1: +; FEATURE-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <4 x i32> [[VEC1_BC]] to <16 x i8> +; FEATURE-NEXT: [[SEL0:%.*]] = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer +; FEATURE-NEXT: [[SEL0_BC:%.*]] = bitcast <16 x i8> [[SEL0]] to <4 x i32> +; FEATURE-NEXT: [[SEL1:%.*]] = select i1 false, <16 x i8> [[VEC1_BC_BC]], <16 x i8> [[SEL0]] +; FEATURE-NEXT: br label [[BB_2:%.*]] +; FEATURE: bb.2: +; FEATURE-NEXT: [[SEL0_BC_BC:%.*]] = bitcast <4 x i32> [[SEL0_BC]] to <16 x i8> +; FEATURE-NEXT: [[VAL:%.*]] = extractelement <16 x i8> [[SEL0_BC_BC]], i64 0 +; FEATURE-NEXT: ret void +; +; DEFAULT-LABEL: define amdgpu_kernel void @reuseOp( +; DEFAULT-SAME: ) #[[ATTR0]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[VEC1:%.*]] = insertelement <16 x i8> zeroinitializer, i8 0, i64 0 +; DEFAULT-NEXT: [[VEC1_BC:%.*]] = bitcast <16 x i8> [[VEC1]] to <4 x i32> +; DEFAULT-NEXT: br label [[BB_1:%.*]] +; DEFAULT: bb.1: +; DEFAULT-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <4 x i32> [[VEC1_BC]] to <16 x i8> +; DEFAULT-NEXT: [[SEL0:%.*]] = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer +; DEFAULT-NEXT: [[SEL0_BC:%.*]] = bitcast <16 x i8> [[SEL0]] 
to <4 x i32> +; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 false, <16 x i8> [[VEC1_BC_BC]], <16 x i8> [[SEL0]] +; DEFAULT-NEXT: br label [[BB_2:%.*]] +; DEFAULT: bb.2: +; DEFAULT-NEXT: [[SEL0_BC_BC:%.*]] = bitcast <4 x i32> [[SEL0_BC]] to <16 x i8> +; DEFAULT-NEXT: [[VAL:%.*]] = extractelement <16 x i8> [[SEL0_BC_BC]], i64 0 +; DEFAULT-NEXT: ret void +; entry: %vec1 = insertelement <16 x i8> zeroinitializer, i8 0, i64 0 br label %bb.1 @@ -424,7 +907,6 @@ bb.2: ret void } - define amdgpu_kernel void @deletedPHI(i32 %in0, i1 %cmp, <10 x i8> %invec0) { ; GFX906-LABEL: define amdgpu_kernel void @deletedPHI( ; GFX906-SAME: i32 [[IN0:%.*]], i1 [[CMP:%.*]], <10 x i8> [[INVEC0:%.*]]) #[[ATTR0]] { @@ -462,6 +944,78 @@ define amdgpu_kernel void @deletedPHI(i32 %in0, i1 %cmp, <10 x i8> %invec0) { ; GFX906-NEXT: [[VEC1]] = shufflevector <10 x i8> [[PHI6]], <10 x i8> zeroinitializer, <10 x i32> ; GFX906-NEXT: br label [[BB_1]] ; +; FEATURE-LABEL: define amdgpu_kernel void @deletedPHI( +; FEATURE-SAME: i32 [[IN0:%.*]], i1 [[CMP:%.*]], <10 x i8> [[INVEC0:%.*]]) #[[ATTR0]] { +; FEATURE-NEXT: entry: +; FEATURE-NEXT: br label [[BB_1:%.*]] +; FEATURE: bb.1: +; FEATURE-NEXT: [[PHI0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[BB_11:%.*]] ] +; FEATURE-NEXT: [[PHI1:%.*]] = phi <10 x i8> [ splat (i8 1), [[ENTRY]] ], [ [[VEC1:%.*]], [[BB_11]] ] +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_3:%.*]], label [[BB_2:%.*]] +; FEATURE: bb.2: +; FEATURE-NEXT: br label [[BB_3]] +; FEATURE: bb.3: +; FEATURE-NEXT: [[PHI2:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_2]] ], [ [[PHI1]], [[BB_1]] ] +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_5:%.*]], label [[BB_4:%.*]] +; FEATURE: bb.4: +; FEATURE-NEXT: [[VEC0:%.*]] = insertelement <10 x i8> [[PHI2]], i8 0, i64 0 +; FEATURE-NEXT: br label [[BB_5]] +; FEATURE: bb.5: +; FEATURE-NEXT: [[PHI3:%.*]] = phi <10 x i8> [ [[VEC0]], [[BB_4]] ], [ [[PHI2]], [[BB_3]] ] +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_7:%.*]], label [[BB_6:%.*]] +; FEATURE: bb.6: +; FEATURE-NEXT: 
br label [[BB_7]] +; FEATURE: bb.7: +; FEATURE-NEXT: [[PHI4:%.*]] = phi <10 x i8> [ [[INVEC0]], [[BB_6]] ], [ [[PHI3]], [[BB_5]] ] +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_9:%.*]], label [[BB_8:%.*]] +; FEATURE: bb.8: +; FEATURE-NEXT: br label [[BB_9]] +; FEATURE: bb.9: +; FEATURE-NEXT: [[PHI5:%.*]] = phi <10 x i8> [ [[INVEC0]], [[BB_8]] ], [ [[PHI4]], [[BB_7]] ] +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_11]], label [[BB_10:%.*]] +; FEATURE: bb.10: +; FEATURE-NEXT: br label [[BB_11]] +; FEATURE: bb.11: +; FEATURE-NEXT: [[PHI6:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_10]] ], [ [[PHI5]], [[BB_9]] ] +; FEATURE-NEXT: [[VEC1]] = shufflevector <10 x i8> [[PHI6]], <10 x i8> zeroinitializer, <10 x i32> +; FEATURE-NEXT: br label [[BB_1]] +; +; DEFAULT-LABEL: define amdgpu_kernel void @deletedPHI( +; DEFAULT-SAME: i32 [[IN0:%.*]], i1 [[CMP:%.*]], <10 x i8> [[INVEC0:%.*]]) #[[ATTR0]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: br label [[BB_1:%.*]] +; DEFAULT: bb.1: +; DEFAULT-NEXT: [[PHI0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[BB_11:%.*]] ] +; DEFAULT-NEXT: [[PHI1:%.*]] = phi <10 x i8> [ splat (i8 1), [[ENTRY]] ], [ [[VEC1:%.*]], [[BB_11]] ] +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_3:%.*]], label [[BB_2:%.*]] +; DEFAULT: bb.2: +; DEFAULT-NEXT: br label [[BB_3]] +; DEFAULT: bb.3: +; DEFAULT-NEXT: [[PHI2:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_2]] ], [ [[PHI1]], [[BB_1]] ] +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_5:%.*]], label [[BB_4:%.*]] +; DEFAULT: bb.4: +; DEFAULT-NEXT: [[VEC0:%.*]] = insertelement <10 x i8> [[PHI2]], i8 0, i64 0 +; DEFAULT-NEXT: br label [[BB_5]] +; DEFAULT: bb.5: +; DEFAULT-NEXT: [[PHI3:%.*]] = phi <10 x i8> [ [[VEC0]], [[BB_4]] ], [ [[PHI2]], [[BB_3]] ] +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_7:%.*]], label [[BB_6:%.*]] +; DEFAULT: bb.6: +; DEFAULT-NEXT: br label [[BB_7]] +; DEFAULT: bb.7: +; DEFAULT-NEXT: [[PHI4:%.*]] = phi <10 x i8> [ [[INVEC0]], [[BB_6]] ], [ [[PHI3]], [[BB_5]] ] +; DEFAULT-NEXT: br i1 [[CMP]], label 
[[BB_9:%.*]], label [[BB_8:%.*]] +; DEFAULT: bb.8: +; DEFAULT-NEXT: br label [[BB_9]] +; DEFAULT: bb.9: +; DEFAULT-NEXT: [[PHI5:%.*]] = phi <10 x i8> [ [[INVEC0]], [[BB_8]] ], [ [[PHI4]], [[BB_7]] ] +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_11]], label [[BB_10:%.*]] +; DEFAULT: bb.10: +; DEFAULT-NEXT: br label [[BB_11]] +; DEFAULT: bb.11: +; DEFAULT-NEXT: [[PHI6:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_10]] ], [ [[PHI5]], [[BB_9]] ] +; DEFAULT-NEXT: [[VEC1]] = shufflevector <10 x i8> [[PHI6]], <10 x i8> zeroinitializer, <10 x i32> +; DEFAULT-NEXT: br label [[BB_1]] +; entry: br label %bb.1 @@ -534,6 +1088,56 @@ define amdgpu_kernel void @multiple_unwind(i1 %cmp, <10 x i8> %invec) { ; GFX906: bb.8: ; GFX906-NEXT: br label [[BB_1]] ; +; FEATURE-LABEL: define amdgpu_kernel void @multiple_unwind( +; FEATURE-SAME: i1 [[CMP:%.*]], <10 x i8> [[INVEC:%.*]]) #[[ATTR0]] { +; FEATURE-NEXT: entry: +; FEATURE-NEXT: br label [[BB_1:%.*]] +; FEATURE: bb.1: +; FEATURE-NEXT: [[PHI0:%.*]] = phi <10 x i8> [ splat (i8 1), [[ENTRY:%.*]] ], [ [[PHI3:%.*]], [[BB_8:%.*]] ] +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_3:%.*]], label [[BB_2:%.*]] +; FEATURE: bb.2: +; FEATURE-NEXT: br label [[BB_3]] +; FEATURE: bb.3: +; FEATURE-NEXT: [[PHI1:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_2]] ], [ [[PHI0]], [[BB_1]] ] +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_5:%.*]], label [[BB_4:%.*]] +; FEATURE: bb.4: +; FEATURE-NEXT: br label [[BB_5]] +; FEATURE: bb.5: +; FEATURE-NEXT: [[PHI2:%.*]] = phi <10 x i8> [ [[PHI0]], [[BB_4]] ], [ [[PHI1]], [[BB_3]] ] +; FEATURE-NEXT: br i1 [[CMP]], label [[BB_7:%.*]], label [[BB_6:%.*]] +; FEATURE: bb.6: +; FEATURE-NEXT: br label [[BB_7]] +; FEATURE: bb.7: +; FEATURE-NEXT: [[PHI3]] = phi <10 x i8> [ [[INVEC]], [[BB_6]] ], [ [[PHI2]], [[BB_5]] ] +; FEATURE-NEXT: br label [[BB_8]] +; FEATURE: bb.8: +; FEATURE-NEXT: br label [[BB_1]] +; +; DEFAULT-LABEL: define amdgpu_kernel void @multiple_unwind( +; DEFAULT-SAME: i1 [[CMP:%.*]], <10 x i8> [[INVEC:%.*]]) 
#[[ATTR0]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: br label [[BB_1:%.*]] +; DEFAULT: bb.1: +; DEFAULT-NEXT: [[PHI0:%.*]] = phi <10 x i8> [ splat (i8 1), [[ENTRY:%.*]] ], [ [[PHI3:%.*]], [[BB_8:%.*]] ] +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_3:%.*]], label [[BB_2:%.*]] +; DEFAULT: bb.2: +; DEFAULT-NEXT: br label [[BB_3]] +; DEFAULT: bb.3: +; DEFAULT-NEXT: [[PHI1:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_2]] ], [ [[PHI0]], [[BB_1]] ] +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_5:%.*]], label [[BB_4:%.*]] +; DEFAULT: bb.4: +; DEFAULT-NEXT: br label [[BB_5]] +; DEFAULT: bb.5: +; DEFAULT-NEXT: [[PHI2:%.*]] = phi <10 x i8> [ [[PHI0]], [[BB_4]] ], [ [[PHI1]], [[BB_3]] ] +; DEFAULT-NEXT: br i1 [[CMP]], label [[BB_7:%.*]], label [[BB_6:%.*]] +; DEFAULT: bb.6: +; DEFAULT-NEXT: br label [[BB_7]] +; DEFAULT: bb.7: +; DEFAULT-NEXT: [[PHI3]] = phi <10 x i8> [ [[INVEC]], [[BB_6]] ], [ [[PHI2]], [[BB_5]] ] +; DEFAULT-NEXT: br label [[BB_8]] +; DEFAULT: bb.8: +; DEFAULT-NEXT: br label [[BB_1]] +; entry: br label %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir index 4762760c4ba24..8bd053ea24d2f 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm.mir @@ -212,7 +212,7 @@ body: | %9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %10:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %8.sub0:sreg_64, 0, implicit $exec %11:vgpr_32 = V_MOV_B32_dpp %9:vgpr_32, %10:vgpr_32, 312, 15, 15, 0, implicit $exec - %12:sreg_32 = V_READLANE_B32 %11:vgpr_32, 63 + %12:sreg_32_xm0 = V_READLANE_B32 %11:vgpr_32, 63 early-clobber %13:vgpr_32 = STRICT_WWM %9:vgpr_32, implicit $exec BUFFER_STORE_DWORD_OFFSET_exact killed %13, %4, %5, 4, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 08cc2e4ec7d79..6288a80446cf0 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -301,7 +301,7 @@ define hidden i32 @called(i32 %a) noinline { ret 
i32 %sub } -define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { +define amdgpu_kernel void @call(ptr addrspace(8) %tmp14, i32 %arg) { ; GFX9-O0-LABEL: call: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_mov_b32 s32, 0 @@ -533,7 +533,7 @@ define i64 @called_i64(i64 %a) noinline { ret i64 %sub } -define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) { +define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) { ; GFX9-O0-LABEL: call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_mov_b32 s32, 0 @@ -1153,7 +1153,7 @@ define hidden i32 @strict_wwm_called(i32 %a) noinline { ret i32 %sub } -define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { +define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) %tmp14, i32 %arg) { ; GFX9-O0-LABEL: strict_wwm_call: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_mov_b32 s32, 0 @@ -1385,7 +1385,7 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline { ret i64 %sub } -define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) { +define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg) { ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll index 6427a3e34cf8e..186276b50ceeb 100644 --- a/llvm/test/CodeGen/ARM/select-imm.ll +++ b/llvm/test/CodeGen/ARM/select-imm.ll @@ -295,15 +295,13 @@ define i32 @t7(i32 %a, i32 %b) nounwind readnone { ; ARM-LABEL: t7: ; ARM: @ %bb.0: @ %entry ; ARM-NEXT: subs r0, r0, r1 -; ARM-NEXT: movne r0, #1 -; ARM-NEXT: lsl r0, r0, #2 +; ARM-NEXT: movne r0, #4 ; ARM-NEXT: mov pc, lr ; ; ARMT2-LABEL: t7: ; ARMT2: @ %bb.0: @ %entry ; ARMT2-NEXT: subs r0, r0, r1 -; ARMT2-NEXT: movwne r0, #1 -; ARMT2-NEXT: lsl r0, r0, #2 +; ARMT2-NEXT: movwne r0, #4 ; ARMT2-NEXT: bx lr ; ; THUMB1-LABEL: t7: @@ -318,8 +316,7 @@ define i32 @t7(i32 %a, i32 %b) nounwind 
readnone { ; THUMB2: @ %bb.0: @ %entry ; THUMB2-NEXT: subs r0, r0, r1 ; THUMB2-NEXT: it ne -; THUMB2-NEXT: movne r0, #1 -; THUMB2-NEXT: lsls r0, r0, #2 +; THUMB2-NEXT: movne r0, #4 ; THUMB2-NEXT: bx lr ; ; V8MBASE-LABEL: t7: @@ -824,15 +821,13 @@ define i32 @t12(i32 %a) nounwind { ; ARM-LABEL: t12: ; ARM: @ %bb.0: @ %entry ; ARM-NEXT: cmp r0, #0 -; ARM-NEXT: movne r0, #1 -; ARM-NEXT: lsl r0, r0, #1 +; ARM-NEXT: movne r0, #2 ; ARM-NEXT: mov pc, lr ; ; ARMT2-LABEL: t12: ; ARMT2: @ %bb.0: @ %entry ; ARMT2-NEXT: cmp r0, #0 -; ARMT2-NEXT: movwne r0, #1 -; ARMT2-NEXT: lsl r0, r0, #1 +; ARMT2-NEXT: movwne r0, #2 ; ARMT2-NEXT: bx lr ; ; THUMB1-LABEL: t12: @@ -846,8 +841,7 @@ define i32 @t12(i32 %a) nounwind { ; THUMB2: @ %bb.0: @ %entry ; THUMB2-NEXT: cmp r0, #0 ; THUMB2-NEXT: it ne -; THUMB2-NEXT: movne r0, #1 -; THUMB2-NEXT: lsls r0, r0, #1 +; THUMB2-NEXT: movne r0, #2 ; THUMB2-NEXT: bx lr ; ; V8MBASE-LABEL: t12: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir b/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir index 4d6e33cf0b68a..b427c5bdd7229 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir @@ -1,8 +1,7 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck %s # CHECK: csr_sgpr_spill -# CHECK: spillPhysVGPRs -# CHECK-NEXT: - '$vgpr0' +# CHECK-NOT: spillPhysVGPRs --- name: csr_sgpr_spill tracksRegLiveness: true diff --git a/llvm/test/CodeGen/MIR/X86/pr126107.mir b/llvm/test/CodeGen/MIR/X86/pr126107.mir new file mode 100644 index 0000000000000..e8b3e47f6ff74 --- /dev/null +++ b/llvm/test/CodeGen/MIR/X86/pr126107.mir @@ -0,0 +1,17 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -o - %s -mtriple=x86_64-- -run-pass=machine-cp | FileCheck %s + +--- +name: main +body: | + bb.0.entry: + liveins: $ymm7 + ; CHECK-LABEL: name: main + ; CHECK: liveins: $ymm7 + ; 
CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $ymm6 = COPY killed renamable $ymm7 + ; CHECK-NEXT: CALL64r killed renamable $rax, csr_64_mostregs + ; CHECK-NEXT: renamable $ymm6 = VPADDWZ256rr $ymm6, $ymm6 + renamable $ymm6 = COPY killed renamable $ymm7 + CALL64r killed renamable $rax, csr_64_mostregs + renamable $ymm6 = VPADDWZ256rr $ymm6, $ymm6 diff --git a/llvm/test/CodeGen/MSP430/shift-amount-threshold.ll b/llvm/test/CodeGen/MSP430/shift-amount-threshold.ll index 8166c4688f3d3..1ffae4f982fc2 100644 --- a/llvm/test/CodeGen/MSP430/shift-amount-threshold.ll +++ b/llvm/test/CodeGen/MSP430/shift-amount-threshold.ll @@ -115,13 +115,12 @@ define i16 @testShiftAnd_1(i16 %x) { ; CHECK-LABEL: testShiftAnd_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: mov r12, r13 -; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: mov #2, r12 ; CHECK-NEXT: tst r13 ; CHECK-NEXT: jl .LBB6_2 ; CHECK-NEXT: ; %bb.1: ; %entry ; CHECK-NEXT: clr r12 ; CHECK-NEXT: .LBB6_2: ; %entry -; CHECK-NEXT: add r12, r12 ; CHECK-NEXT: ret entry: %cmp = icmp slt i16 %x, 0 diff --git a/llvm/test/CodeGen/Thumb/branchless-cmp.ll b/llvm/test/CodeGen/Thumb/branchless-cmp.ll index 40c5b8853da9c..e5bfb87f00c92 100644 --- a/llvm/test/CodeGen/Thumb/branchless-cmp.ll +++ b/llvm/test/CodeGen/Thumb/branchless-cmp.ll @@ -1,101 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m0 %s -verify-machineinstrs -o - | FileCheck %s define i32 @test1a(i32 %a, i32 %b) { +; CHECK-LABEL: test1a: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, r0, r1 +; CHECK-NEXT: subs r1, r0, #1 +; CHECK-NEXT: sbcs r0, r1 +; CHECK-NEXT: bx lr entry: %cmp = icmp ne i32 %a, %b %cond = zext i1 %cmp to i32 ret i32 %cond -; CHECK-LABEL: test1a: -; CHECK-NOT: b{{(ne)|(eq)}} -; CHECK: subs r0, r0, r1 -; CHECK-NEXT: subs r1, r0, #1 -; CHECK-NEXT: sbcs r0, r1 } define i32 @test1b(i32 %a, i32 %b) { +; CHECK-LABEL: test1b: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: 
subs r1, r0, r1 +; CHECK-NEXT: rsbs r0, r1, #0 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: bx lr entry: %cmp = icmp eq i32 %a, %b %cond = zext i1 %cmp to i32 ret i32 %cond -; CHECK-LABEL: test1b: -; CHECK-NOT: b{{(ne)|(eq)}} -; CHECK: subs r1, r0, r1 -; CHECK-NEXT: rsbs r0, r1, #0 -; CHECK-NEXT: adcs r0, r1 } define i32 @test2a(i32 %a, i32 %b) { +; CHECK-LABEL: test2a: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r1, r0, r1 +; CHECK-NEXT: rsbs r0, r1, #0 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: bx lr entry: %cmp = icmp eq i32 %a, %b %cond = zext i1 %cmp to i32 ret i32 %cond -; CHECK-LABEL: test2a: -; CHECK-NOT: b{{(ne)|(eq)}} -; CHECK: subs r1, r0, r1 -; CHECK-NEXT: rsbs r0, r1, #0 -; CHECK-NEXT: adcs r0, r1 } define i32 @test2b(i32 %a, i32 %b) { +; CHECK-LABEL: test2b: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, r0, r1 +; CHECK-NEXT: subs r1, r0, #1 +; CHECK-NEXT: sbcs r0, r1 +; CHECK-NEXT: bx lr entry: %cmp = icmp ne i32 %a, %b %cond = zext i1 %cmp to i32 ret i32 %cond -; CHECK-LABEL: test2b: -; CHECK-NOT: b{{(ne)|(eq)}} -; CHECK: subs r0, r0, r1 -; CHECK-NEXT: subs r1, r0, #1 -; CHECK-NEXT: sbcs r0, r1 } define i32 @test3a(i32 %a, i32 %b) { +; CHECK-LABEL: test3a: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, r0, r1 +; CHECK-NEXT: beq .LBB4_2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: movs r0, #4 +; CHECK-NEXT: .LBB4_2: @ %entry +; CHECK-NEXT: bx lr entry: %cmp = icmp eq i32 %a, %b %cond = select i1 %cmp, i32 0, i32 4 ret i32 %cond -; CHECK-LABEL: test3a: -; CHECK-NOT: b{{(ne)|(eq)}} -; CHECK: subs r0, r0, r1 -; CHECK-NEXT: subs r1, r0, #1 -; CHECK-NEXT: sbcs r0, r1 -; CHECK-NEXT: lsls r0, r0, #2 } define i32 @test3b(i32 %a, i32 %b) { +; CHECK-LABEL: test3b: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: beq .LBB5_2 +; CHECK-NEXT: @ %bb.1: @ %entry +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: movs r0, #4 +; CHECK-NEXT: bx lr entry: %cmp = icmp eq i32 %a, %b %cond = select i1 %cmp, i32 4, 
i32 0 ret i32 %cond -; CHECK-LABEL: test3b: -; CHECK-NOT: b{{(ne)|(eq)}} -; CHECK: subs r0, r0, r1 -; CHECK-NEXT: rsbs r1, r0, #0 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: lsls r0, r1, #2 } define i32 @test4a(i32 %a, i32 %b) { +; CHECK-LABEL: test4a: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: bne .LBB6_2 +; CHECK-NEXT: @ %bb.1: @ %entry +; CHECK-NEXT: movs r0, #4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: bx lr entry: %cmp = icmp ne i32 %a, %b %cond = select i1 %cmp, i32 0, i32 4 ret i32 %cond -; CHECK-LABEL: test4a: -; CHECK-NOT: b{{(ne)|(eq)}} -; CHECK: subs r0, r0, r1 -; CHECK-NEXT: rsbs r1, r0, #0 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: lsls r0, r1, #2 } define i32 @test4b(i32 %a, i32 %b) { +; CHECK-LABEL: test4b: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, r0, r1 +; CHECK-NEXT: subs r1, r0, #1 +; CHECK-NEXT: sbcs r0, r1 +; CHECK-NEXT: lsls r0, r0, #2 +; CHECK-NEXT: bx lr entry: %cmp = icmp ne i32 %a, %b %cond = select i1 %cmp, i32 4, i32 0 ret i32 %cond -; CHECK-LABEL: test4b: -; CHECK-NOT: b{{(ne)|(eq)}} -; CHECK: subs r0, r0, r1 -; CHECK-NEXT: subs r1, r0, #1 -; CHECK-NEXT: sbcs r0, r1 -; CHECK-NEXT: lsls r0, r0, #2 } diff --git a/llvm/test/CodeGen/X86/merge-huge-sp-updates.ll b/llvm/test/CodeGen/X86/merge-huge-sp-updates.ll new file mode 100644 index 0000000000000..b26345e2d5bbc --- /dev/null +++ b/llvm/test/CodeGen/X86/merge-huge-sp-updates.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -mtriple=x86_64-linux-unknown -verify-machineinstrs -o %t.s +; RUN: FileCheck --input-file=%t.s %s + +; Double-check that we are able to assemble the generated '.s'. A symptom of the +; problem that led to this test is an assembler failure when using +; '-save-temps'. 
For example: +; +; > ...s:683:7: error: invalid operand for instruction +; > addq $2147483679, %rsp # imm = 0x8000001F +; +; RUN: llvm-mc -triple x86_64-unknown-unknown %t.s + +; Check that the stack update after calling bar gets merged into the second add +; and not the first which is already at the chunk size limit (0x7FFFFFFF). + +define void @foo(ptr %rhs) { +; CHECK-LABEL: foo +entry: + %lhs = alloca [5 x [5 x [3 x [162 x [161 x [161 x double]]]]]], align 16 + store ptr %lhs, ptr %rhs, align 8 + %0 = call i32 @baz() + call void @bar(i64 0, i64 0, i64 0, i64 0, i64 0, ptr null, ptr %rhs, ptr null, ptr %rhs) +; CHECK: call{{.*}}bar +; CHECK: addq{{.*}}$2147483647, %rsp +; CHECK: addq{{.*}}$372037585, %rsp +; CHECK: .cfi_adjust_cfa_offset -2519521232 + ret void +} + +declare void @bar(i64, i64, i64, i64, i64, ptr, ptr, ptr, ptr) + +declare i32 @baz() diff --git a/llvm/test/DebugInfo/AMDGPU/live-debug-values-spill-tracking.mir b/llvm/test/DebugInfo/AMDGPU/live-debug-values-spill-tracking.mir new file mode 100644 index 0000000000000..31bdb1194880a --- /dev/null +++ b/llvm/test/DebugInfo/AMDGPU/live-debug-values-spill-tracking.mir @@ -0,0 +1,71 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass livedebugvalues %s -o - -debug-only livedebugvalues 2>&1 | FileCheck %s + +# Verify that spill tracking is disabled on amdgcn. 
+ +# CHECK: Disabling InstrRefBasedLDV spill tracking for kern since target has too many potential stack slot indexes + +--- | + define void @kern() #0 !dbg !9 { + ret void, !dbg !15 + } + + attributes #0 = { noinline nounwind optnone "target-cpu"="gfx1100" } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!2, !3, !4, !5, !6, !7} + !llvm.ident = !{!8} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 19.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) + !1 = !DIFile(filename: "t.cpp", directory: "/") + !2 = !{i32 1, !"amdhsa_code_object_version", i32 500} + !3 = !{i32 7, !"Dwarf Version", i32 5} + !4 = !{i32 2, !"Debug Info Version", i32 3} + !5 = !{i32 1, !"wchar_size", i32 4} + !6 = !{i32 8, !"PIC Level", i32 2} + !7 = !{i32 7, !"frame-pointer", i32 2} + !8 = !{!"clang version 19.0.0"} + !9 = distinct !DISubprogram(name: "kern", linkageName: "kern", scope: !1, file: !1, line: 1, type: !10, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12) + !10 = !DISubroutineType(types: !11) + !11 = !{} + !12 = !{!13} + !13 = !DILocalVariable(name: "var", scope: !9, file: !1, line: 1, type: !14) + !14 = !DIBasicType(name: "i32", size: 32, encoding: DW_ATE_signed) + !15 = !DILocation(line: 1, column: 1, scope: !9) + +... 
+--- +name: kern +tracksRegLiveness: true +debugInstrRef: true +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + stackPtrOffsetReg: '$sgpr32' + hasSpilledVGPRs: true +body: | + bb.0: + ; CHECK-LABEL: name: kern + ; CHECK: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr0 + ; CHECK-NEXT: DBG_INSTR_REF !13, !DIExpression(DIOpArg(0, i32)), dbg-instr-ref(1, 0), debug-location !15 + ; CHECK-NEXT: DBG_VALUE_LIST !13, !DIExpression(DIOpArg(0, i32)), $noreg, debug-location !15 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, debug-instr-number 1, debug-location !15 + ; CHECK-NEXT: DBG_VALUE_LIST !13, !DIExpression(DIOpArg(0, i32)), $vgpr0, debug-location !15 + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, debug-location !15 :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: S_NOP 0, debug-location !15 + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, debug-location !15 :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: S_ENDPGM 0, debug-location !15 + frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02 + frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 + frame-setup CFI_INSTRUCTION undefined $vgpr0 + DBG_INSTR_REF !13, !DIExpression(DIOpArg(0, i32)), dbg-instr-ref(1, 0), debug-location !15 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec, debug-instr-number 1, debug-location !15 + SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, debug-location !15 :: (store (s32) into %stack.0, addrspace 5) + S_NOP 0, debug-location !15 
+ $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, debug-location !15 :: (load (s32) from %stack.0, addrspace 5) + S_ENDPGM 0, debug-location !15 + +... diff --git a/llvm/test/DebugInfo/verify-diop-based-diexpression.ll b/llvm/test/DebugInfo/verify-diop-based-diexpression.ll index c74ad445b1326..e7af8074f86a6 100644 --- a/llvm/test/DebugInfo/verify-diop-based-diexpression.ll +++ b/llvm/test/DebugInfo/verify-diop-based-diexpression.ll @@ -26,6 +26,15 @@ entry: ; CHECK: #dbg_declare(i8 poison, ![[#]], !DIExpression(DIOpArg(0, i32)), ![[#]]) call void @llvm.dbg.declare(metadata i8 poison, metadata !24, metadata !DIExpression(DIOpArg(0, i32))), !dbg !22 + ; CHECK: #dbg_declare(ptr %i, ![[#]], !DIExpression(DIOpArg(0, ptr), DIOpDeref(%struct.type), DIOpConstant(i32 64), DIOpBitOffset(ptr)), ![[#]]) + call void @llvm.dbg.declare(metadata ptr %i, metadata !26, metadata !DIExpression(DIOpArg(0, ptr), DIOpDeref(%struct.type), DIOpConstant(i32 64), DIOpBitOffset(ptr))), !dbg !22 + + ; CHECK: #dbg_declare(ptr %i, ![[#]], !DIExpression(DIOpArg(0, ptr), DIOpDeref(%struct.type), DIOpConstant(i32 8), DIOpByteOffset(ptr)), ![[#]]) + call void @llvm.dbg.declare(metadata ptr %i, metadata !27, metadata !DIExpression(DIOpArg(0, ptr), DIOpDeref(%struct.type), DIOpConstant(i32 8), DIOpByteOffset(ptr))), !dbg !22 + + ; CHECK: #dbg_declare(i32 3, ![[#]], !DIExpression(DIOpArg(0, i32), DIOpConstant(<2 x i32> ), DIOpConstant(<2 x i32> ), DIOpSelect()), ![[#]]) + call void @llvm.dbg.declare(metadata i32 3, metadata !28, metadata !DIExpression(DIOpArg(0, i32), DIOpConstant(<2 x i32> ), DIOpConstant(<2 x i32> ), DIOpSelect())), !dbg !22 + ret void } @@ -55,6 +64,10 @@ entry: !22 = !DILocation(line: 12, column: 7, scope: !17) !23 = !DILocation(line: 13, column: 1, scope: !17) !24 = !DILocalVariable(name: "j", scope: !17, file: !1, line: 12, type: !10) +!25 = !DIBasicType(name: "int64", size: 64, encoding: DW_ATE_unsigned) +!26 = !DILocalVariable(name: "k", 
scope: !17, file: !1, line: 12, type: !25) +!27 = !DILocalVariable(name: "l", scope: !17, file: !1, line: 12, type: !25) +!28 = !DILocalVariable(name: "m", scope: !17, file: !1, line: 12, type: !25) ;--- invalid.ll ; RUN: opt invalid.ll -S -passes=verify 2>&1 | FileCheck invalid.ll diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s index 34c42affdd46c..364463f9404bc 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s @@ -1922,3 +1922,21 @@ ds_bpermute_fi_b32 v5, v1, v2 offset:0 ds_bpermute_fi_b32 v255, v255, v255 offset:4 // GFX12: encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff] + +ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] +// GFX12: encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01] + +ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1 +// GFX12: encoding: [0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01] + +ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] +// GFX12: encoding: [0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01] + +ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1 +// GFX12: encoding: [0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01] + +ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] +// GFX12: encoding: [0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe] + +ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] offset:127 +// GFX12: encoding: [0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s index c10b96a292178..55e284d4afde9 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s @@ -35,3 +35,6 @@ ds_subrev_u64 v1, v[2:3] ds_subrev_rtn_u64 v[5:6], v1, v[2:3] // GFX12: ds_rsub_rtn_u64 v[5:6], v1, v[2:3] ; encoding: [0x00,0x00,0x88,0xd9,0x01,0x02,0x00,0x05] + +ds_bvh_stack_rtn_b32 v1, v0, v1, v[2:5] +// GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] ; encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01] diff 
--git a/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s b/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s index 8bf9b92e8d1d8..c99123bbe1ee0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s @@ -1066,6 +1066,12 @@ image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17], v[18:20]], image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17]], s[4:7] a16 // GFX12: encoding: [0x50,0x80,0xc6,0xd3,0x04,0x08,0x00,0x00,0x09,0x0b,0x0c,0x0f] +image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v[9:10]], s[0:3] +// GFX12: encoding: [0x10,0x00,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06] + +image_bvh8_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v9], s[0:3] +// GFX12: encoding: [0x10,0x40,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06] + image_get_resinfo v4, v32, s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_1D // GFX12: encoding: [0x00,0xc0,0x45,0xd0,0x04,0xc0,0x00,0x00,0x20,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s index b9999b671f7e7..f693fe3d22d26 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s @@ -41,3 +41,9 @@ bvh_intersect_ray v[4:7], [v9, v10, v[11:13], v[14:16], v[17:19]], s[4:7] bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17], v[18:20]], s[4:7] // GFX12: image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17], v[18:20]], s[4:7] ; encoding: [0x10,0x80,0xc6,0xd3,0x04,0x08,0x00,0x12,0x09,0x0b,0x0c,0x0f] + +bvh_dual_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v[9:10]], s[0:3] +// GFX12: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v[9:10]], s[0:3] ; encoding: [0x10,0x00,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06] + +bvh8_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v9], s[0:3] +// GFX12: image_bvh8_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v9], s[0:3] ; 
encoding: [0x10,0x40,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06] diff --git a/llvm/test/MC/AMDGPU/gfx950-unsupported.s b/llvm/test/MC/AMDGPU/gfx950-unsupported.s index 225784177ae18..8bdab2da2394c 100644 --- a/llvm/test/MC/AMDGPU/gfx950-unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx950-unsupported.s @@ -239,10 +239,6 @@ ds_read_b64_tr_b16 v[2:3], v2 offset:-64 //===----------------------------------------------------------------------===// // ds_read_b96_tr_b6 //===----------------------------------------------------------------------===// -ds_read_b96_tr_b6 v[1:3], v0 -// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU - ds_read_b96_tr_b6 v1, v0 // ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction // W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_features.s b/llvm/test/MC/AMDGPU/gfx950_asm_features.s index 389b17296c045..813940ae915ee 100644 --- a/llvm/test/MC/AMDGPU/gfx950_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx950_asm_features.s @@ -41,14 +41,26 @@ global_load_lds_dwordx4 v2, s[4:5] offset:4 // GFX950: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0xb3,0x02,0x7e] v_permlane16_swap_b32 v1, v2 +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: v_permlane16_swap_b32_e32 v218, v219 ; encoding: [0xdb,0xb3,0xb4,0x7f] +v_permlane16_swap_b32 v218, v219 + // NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: // GFX950: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0xb3,0x02,0x7e] v_permlane16_swap_b32_e32 v1, v2 +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: v_permlane16_swap_b32_e32 v218, v219 ; encoding: [0xdb,0xb3,0xb4,0x7f] +v_permlane16_swap_b32_e32 v218, v219 + // NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: // GFX950: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0x99,0xd1,0x02,0x01,0x00,0x00] 
v_permlane16_swap_b32_e64 v1, v2 +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: v_permlane16_swap_b32_e64 v218, v219 ; encoding: [0xda,0x00,0x99,0xd1,0xdb,0x01,0x00,0x00] +v_permlane16_swap_b32_e64 v218, v219 + // FIXME: Parsed as bound_ctrl:1? // NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: // GFX950: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0x99,0xd1,0x02,0x01,0x00,0x00] @@ -82,14 +94,26 @@ v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 // GFX950: v_permlane32_swap_b32_e32 v1, v2 ; encoding: [0x02,0xb5,0x02,0x7e] v_permlane32_swap_b32 v1, v2 +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: v_permlane32_swap_b32_e32 v218, v219 ; encoding: [0xdb,0xb5,0xb4,0x7f] +v_permlane32_swap_b32 v218, v219 + // NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: // GFX950: v_permlane32_swap_b32_e32 v1, v2 ; encoding: [0x02,0xb5,0x02,0x7e] v_permlane32_swap_b32_e32 v1, v2 +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: v_permlane32_swap_b32_e32 v218, v219 ; encoding: [0xdb,0xb5,0xb4,0x7f] +v_permlane32_swap_b32_e32 v218, v219 + // NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: // GFX950: v_permlane32_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0x9a,0xd1,0x02,0x01,0x00,0x00] v_permlane32_swap_b32_e64 v1, v2 +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: v_permlane32_swap_b32_e64 v218, v219 ; encoding: [0xda,0x00,0x9a,0xd1,0xdb,0x01,0x00,0x00] +v_permlane32_swap_b32_e64 v218, v219 + // FIXME: Parsed as bound_ctrl:1? 
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: // GFX950: v_permlane32_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0x9a,0xd1,0x02,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s b/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s index a6907caafcbb6..e6606ac8b72d0 100644 --- a/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s +++ b/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s @@ -32,3 +32,11 @@ ds_read_b96_tr_b6 v[0:2], v0 ds_read_b96_tr_b6 v[2:4], v2 offset:64 // GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU // GFX950: encoding: [0x40,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02] + +ds_read_b96_tr_b6 v[1:3], v0 +// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +// GFX950: encoding: [0x00,0x00,0xc2,0xd9,0x00,0x00,0x00,0x01] + +ds_read_b96_tr_b6 v[1:3], v2 offset:64 +// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +// GFX950: encoding: [0x40,0x00,0xc2,0xd9,0x02,0x00,0x00,0x01] diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s index e0b832d8fe297..099916f48b5e7 100644 --- a/llvm/test/MC/AMDGPU/gfx950_err.s +++ b/llvm/test/MC/AMDGPU/gfx950_err.s @@ -434,3 +434,66 @@ v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], v38, v39 clamp // GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 clamp + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_f32_fp6 v[0:31], s[32:37], v6 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_f32_bf6 v[0:31], s[32:37], v6 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_f16_fp6 v[0:15], s[20:25], v8 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_bf16_fp6 v[0:15], s[20:25], v8 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand 
for instruction +v_cvt_scalef32_pk32_f16_bf6 v[0:15], s[20:25], v8 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_bf16_bf6 v[0:15], s[20:25], v8 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_fp6_f16 v[18:23], s[0:15], v16 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_bf6_f16 v[18:23], s[0:15], v16 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_fp6_bf16 v[18:23], s[0:15], v16 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_bf6_bf16 v[18:23], s[0:15], v16 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], s[0:15], v16, v17 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_sr_pk32_bf6_f16 v[20:25], s[0:15], v16, v17 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size +v_cvt_scalef32_sr_pk32_bf6_f32 v[36:41], s[0:31], v32, v33 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], s[0:15], v16, v17 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_sr_pk32_fp6_f16 v[20:25], s[0:15], v16, v17 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size +v_cvt_scalef32_sr_pk32_fp6_f32 v[36:41], s[0:31], v32, v33 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], s[0:15], v[6:21], v16 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[6:21], s[0:15], v16 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], s[0:15], v[6:21], v16 + +// GFX950: 
:[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[6:21], s[0:15], v16 + +// GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v4, v5 +v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3], v4, v5 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s index ea649bc76116a..50491533c73b3 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s @@ -29,7 +29,7 @@ // OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 -// OBJDUMP-NEXT: 0030 00000c60 80000000 00040000 00000000 +// OBJDUMP-NEXT: 0030 00000ce0 80000000 00040000 00000000 // complete // OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 @@ -39,12 +39,12 @@ // OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000 -// OBJDUMP-NEXT: 00b0 00000060 80000000 00040000 00000000 +// OBJDUMP-NEXT: 00b0 000000e0 80000000 00040000 00000000 // disabled_user_sgpr // OBJDUMP-NEXT: 00c0 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 -// OBJDUMP-NEXT: 00f0 00000c60 80000000 00040000 00000000 +// OBJDUMP-NEXT: 00f0 00000ce0 80000000 00040000 00000000 .text diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s index bec717e4137df..7f5240d649b7f 100644 --- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s @@ -126,16 +126,16 @@ expr_defined: // ASM-NEXT: .amdhsa_reserve_vcc defined_boolean // ASM-NEXT: .amdhsa_reserve_flat_scratch defined_boolean // ASM-NEXT: .amdhsa_reserve_xnack_mask 1 -// 
ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12 -// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14 -// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16 -// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18 -// ASM-NEXT: .amdhsa_dx10_clamp 
(((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21 -// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&8388608)>>23 -// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&67108864)>>26 -// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&536870912)>>29 -// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 
8))/8)-1))&(~960))&1073741824)>>30 -// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31 +// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 
(((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18 +// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21 +// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&8388608)>>23 +// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&67108864)>>26 +// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 
8))/8)-1))&(~960))&536870912)>>29 +// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30 +// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31 // ASM-NEXT: .amdhsa_shared_vgpr_count 0 // ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&16777216)>>24 // ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&33554432)>>25 diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s 
b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s index 85a7ad05b00f4..e4cbc14f99d70 100644 --- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s @@ -122,16 +122,16 @@ expr_defined: // ASM-NEXT: .amdhsa_next_free_vgpr defined_value+4 // ASM-NEXT: .amdhsa_next_free_sgpr defined_value+5 // ASM-NEXT: .amdhsa_reserve_vcc defined_boolean -// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12 -// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14 -// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16 -// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 
(((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18 -// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21 -// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&8388608)>>23 -// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&67108864)>>26 -// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 
8))/8)-1))&(~960))&536870912)>>29 -// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30 -// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31 +// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 
(((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18 +// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21 +// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&8388608)>>23 +// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 
8))/8)-1))&(~960))&67108864)>>26 +// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&536870912)>>29 +// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30 +// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31 // ASM-NEXT: .amdhsa_shared_vgpr_count 0 // ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&16777216)>>24 // ASM-NEXT: .amdhsa_exception_fp_denorm_src 
(((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&33554432)>>25 diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s index 51d0fb30b320c..ecec1d881f6b1 100644 --- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s @@ -124,15 +124,15 @@ expr_defined: // ASM-NEXT: .amdhsa_next_free_vgpr defined_value+4 // ASM-NEXT: .amdhsa_next_free_sgpr defined_value+5 // ASM-NEXT: .amdhsa_reserve_vcc defined_boolean -// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12 -// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14 -// ASM-NEXT: .amdhsa_float_denorm_mode_32 
(((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16 -// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18 -// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&67108864)>>26 -// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&536870912)>>29 -// ASM-NEXT: .amdhsa_memory_ordered 
(((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30 -// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31 -// ASM-NEXT: .amdhsa_round_robin_scheduling (((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21 +// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 
(((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18 +// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&67108864)>>26 +// ASM-NEXT: .amdhsa_workgroup_processor_mode 
(((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&536870912)>>29 +// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30 +// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31 +// ASM-NEXT: .amdhsa_round_robin_scheduling (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21 // ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 
(((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&16777216)>>24 // ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&33554432)>>25 // ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&67108864)>>26 diff --git a/llvm/test/MC/AMDGPU/mai-gfx950-err.s b/llvm/test/MC/AMDGPU/mai-gfx950-err.s index e700b0b3cabfe..5c9dbd7f7636f 100644 --- a/llvm/test/MC/AMDGPU/mai-gfx950-err.s +++ b/llvm/test/MC/AMDGPU/mai-gfx950-err.s @@ -156,3 +156,51 @@ v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:9], v[0:3] v20, v21 blgp v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:11], v[0:3] v20, v21 blgp:4 // CHECK: :[[@LINE-1]]:53: error: wrong register tuple size for blgp value 
4 + + +// Workaround a hardware bug to disallow sgpr/inline constants as scale operands + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 +// CHECK: :[[@LINE-1]]:77: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, 9 +// CHECK: :[[@LINE-1]]:77: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, 4.0 +// CHECK: :[[@LINE-1]]:77: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49 +// CHECK: :[[@LINE-1]]:73: error: invalid operand for instruction + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0 +// CHECK: 
:[[@LINE-1]]:78: error: invalid operand for instruction + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, v24 +// CHECK: :[[@LINE-1]]:73: error: invalid operand for instruction + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, v24 +// CHECK: :[[@LINE-1]]:73: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/mai-gfx950.s b/llvm/test/MC/AMDGPU/mai-gfx950.s index 23b1ba2c3cd13..4aa14f0538b8b 100644 --- a/llvm/test/MC/AMDGPU/mai-gfx950.s +++ b/llvm/test/MC/AMDGPU/mai-gfx950.s @@ -375,7 +375,7 @@ v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[4:11], a[4:9], a[0:15] cbsz:1 blgp:3 //===----------------------------------------------------------------------===// // v_mfma_scale_f32_16x16x128_f8f6f4 //===----------------------------------------------------------------------===// -// FIXME: Test op_sel, neg, clamp +// FIXME: Test neg, clamp // GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU @@ -405,70 +405,85 @@ v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], v[12:19], v[20:23], v24, v25 // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 
v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] blgp:2 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44] // 
ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] blgp:2 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 + +// op_sel combinations -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 
op_sel:[0,1,0] op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x08,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,1,0] op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x18,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,1,0] cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: 
[0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 cbsz:3 blgp:1 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,1,0] cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] cbsz:3 blgp:1 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,0,0] cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,1,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], 
v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[1,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[1,1,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[1,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[1,0,0] cbsz:3 blgp:1 + +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x10,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[0,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x08,0xac,0xd3,0x18,0x33,0x02,0x10,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] blgp:2 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] 
op_sel_hi:[0,1,0] cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[1,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x08,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[1,0,0] cbsz:3 blgp:1 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04] +// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[1,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x08,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:3 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel:[1,0,0] op_sel_hi:[1,1,0] cbsz:3 blgp:1 + //===----------------------------------------------------------------------===// // v_mfma_scale_f32_32x32x64_f8f6f4 //===----------------------------------------------------------------------===// -// FIXME: Test op_sel, neg, clamp +// FIXME: Test neg, clamp // GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: 
[0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU @@ -514,10 +529,61 @@ v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 blgp:2 -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:2 blgp:3 ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x10,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:2 blgp:3 +// op_sel combinations + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,0,0] ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[0,0,0] ; encoding: [0x00,0x08,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, 
v49 op_sel:[1,1,0] op_sel_hi:[0,0,0] ; encoding: [0x00,0x18,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,1,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x10,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[1,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x08,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[1,0,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[1,1,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[1,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x10,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] 
op_sel_hi:[1,1,0] ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[1,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x08,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[1,0,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x00,0x08,0xac,0xd3,0x30,0x63,0x02,0x10,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[0,1,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; encoding: [0x00,0x08,0xac,0xd3,0x30,0x63,0x02,0x08,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[1,0,0] + +// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[1,1,0] ; encoding: [0x00,0x08,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[1,0,0] op_sel_hi:[1,1,0] + + 
//===----------------------------------------------------------------------===// // v_mfma_f32_16x16x128_f8f6f4 with appropriate register widths //===----------------------------------------------------------------------===// diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt index d66748135ffd4..d9381b50ca29f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt @@ -3242,3 +3242,27 @@ # GFX12: ds_bpermute_fi_b32 v255, v255, v255 offset:4 ; encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff] 0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff + +# GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] ; encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01] +0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01 + +# GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1 ; encoding: [0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01] +0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01 + +# GFX12: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] ; encoding: [0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01] +0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01 + +# GFX12: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1 ; encoding: [0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01] +0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01 + +# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] ; encoding: [0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe] +0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe + +# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] offset:127 ; encoding: [0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe] +0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe + +# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v3, v4, v[5:12] offset:127 ; encoding: [0x7f,0x00,0x88,0xdb,0x03,0x04,0x05,0x01] +0x7f,0x00,0x88,0xdb,0x03,0x04,0x05,0x01 + +# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v3, v4, v[5:12] ; encoding: [0x00,0x00,0x88,0xdb,0x03,0x04,0x05,0x01] 
+0x00,0x00,0x88,0xdb,0x03,0x04,0x05,0x01 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt index 233c2e1b9d083..387bdf5a6018f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt @@ -1066,6 +1066,12 @@ # GFX12: image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17]], s[4:7] a16 ; encoding: [0x50,0x80,0xc6,0xd3,0x04,0x08,0x00,0x00,0x09,0x0b,0x0c,0x0f] 0x50,0x80,0xc6,0xd3,0x04,0x08,0x00,0x00,0x09,0x0b,0x0c,0x0f +# GFX12: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v[9:10]], s[0:3] ; encoding: [0x10,0x00,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06] +0x10,0x00,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06 + +# GFX12: image_bvh8_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v9], s[0:3] ; encoding: [0x10,0x40,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06] +0x10,0x40,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06 + # GFX12: image_get_resinfo v4, v32, s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0xc0,0x45,0xd0,0x04,0xc0,0x00,0x00,0x20,0x00,0x00,0x00] 0x00,0xc0,0x45,0xd0,0x04,0xc0,0x00,0x00,0x20,0x00,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt index 9fc9c58387b90..01821593b0707 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt @@ -47,9 +47,27 @@ # GFX950: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0xb3,0x02,0x7e] 0x02,0xb3,0x02,0x7e +# GFX950: v_permlane16_swap_b32_e32 v218, v219 ; encoding: [0xdb,0xb3,0xb4,0x7f] +0xdb,0xb3,0xb4,0x7f + +# GFX950: v_permlane16_swap_b32_e32 v218, v2 ; encoding: [0x02,0xb3,0xb4,0x7f] +0x02,0xb3,0xb4,0x7f + +# GFX950: v_permlane16_swap_b32_e32 v2, v219 ; encoding: [0xdb,0xb3,0x04,0x7e] +0xdb,0xb3,0x04,0x7e + # GFX950: v_permlane16_swap_b32_e64 v1, v2 ; encoding: 
[0x01,0x00,0x99,0xd1,0x02,0x01,0x00,0x00] 0x01,0x00,0x99,0xd1,0x02,0x01,0x00,0x00 +# GFX950: v_permlane16_swap_b32_e64 v218, v219 ; encoding: [0xda,0x00,0x99,0xd1,0xdb,0x01,0x00,0x00] +0xda,0x00,0x99,0xd1,0xdb,0x01,0x00,0x00 + +# GFX950: v_permlane16_swap_b32_e64 v218, v2 ; encoding: [0xda,0x00,0x99,0xd1,0x02,0x01,0x00,0x00] +0xda,0x00,0x99,0xd1,0x02,0x01,0x00,0x00 + +# GFX950: v_permlane16_swap_b32_e64 v2, v219 ; encoding: [0x02,0x00,0x99,0xd1,0xdb,0x01,0x00,0x00] +0x02,0x00,0x99,0xd1,0xdb,0x01,0x00,0x00 + # GFX950: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0x99,0xd1,0x02,0x01,0x00,0x00] 0x01,0x10,0x99,0xd1,0x02,0x01,0x00,0x00 @@ -63,9 +81,27 @@ # GFX950: v_permlane32_swap_b32_e32 v1, v2 ; encoding: [0x02,0xb5,0x02,0x7e] 0x02,0xb5,0x02,0x7e +# GFX950: v_permlane32_swap_b32_e32 v218, v219 ; encoding: [0xdb,0xb5,0xb4,0x7f] +0xdb,0xb5,0xb4,0x7f + +# GFX950: v_permlane32_swap_b32_e32 v218, v2 ; encoding: [0x02,0xb5,0xb4,0x7f] +0x02,0xb5,0xb4,0x7f + +# GFX950: v_permlane32_swap_b32_e32 v2, v219 ; encoding: [0xdb,0xb5,0x04,0x7e] +0xdb,0xb5,0x04,0x7e + # GFX950: v_permlane32_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0x9a,0xd1,0x02,0x01,0x00,0x00] 0x01,0x00,0x9a,0xd1,0x02,0x01,0x00,0x00 +# GFX950: v_permlane32_swap_b32_e64 v218, v219 ; encoding: [0xda,0x00,0x9a,0xd1,0xdb,0x01,0x00,0x00] +0xda,0x00,0x9a,0xd1,0xdb,0x01,0x00,0x00 + +# GFX950: v_permlane32_swap_b32_e64 v218, v2 ; encoding: [0xda,0x00,0x9a,0xd1,0x02,0x01,0x00,0x00] +0xda,0x00,0x9a,0xd1,0x02,0x01,0x00,0x00 + +# GFX950: v_permlane32_swap_b32_e64 v2, v219 ; encoding: [0x02,0x00,0x9a,0xd1,0xdb,0x01,0x00,0x00] +0x02,0x00,0x9a,0xd1,0xdb,0x01,0x00,0x00 + # GFX950: v_permlane32_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0x9a,0xd1,0x02,0x01,0x00,0x00] 0x01,0x10,0x9a,0xd1,0x02,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt index 10310f7ad1f3d..0b5dba24612ca 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt @@ -35,3 +35,9 @@ # GFX950: ds_read_b96_tr_b6 v[2:4], v2 offset:64 ; encoding: [0x40,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02] 0x40,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02 + +# GFX950: ds_read_b96_tr_b6 v[1:3], v0 ; encoding: [0x00,0x00,0xc2,0xd9,0x00,0x00,0x00,0x01] +0x00,0x00,0xc2,0xd9,0x00,0x00,0x00,0x01 + +# GFX950: ds_read_b96_tr_b6 v[1:3], v2 offset:64 ; encoding: [0x40,0x00,0xc2,0xd9,0x02,0x00,0x00,0x01] +0x40,0x00,0xc2,0xd9,0x02,0x00,0x00,0x01 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt index 8adc8b79fbbf5..e191455beb64d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt @@ -386,51 +386,21 @@ # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x84] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x84 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44] +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[1,1,0] blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x44 # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 
v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], 
v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,1,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; 
encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84 @@ -452,16 +422,16 @@ # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x64] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x64 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04] +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,1,0] cbsz:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x04 # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x08,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: 
[0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] +# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[1,1,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x18,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24 # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] @@ -530,7 +500,7 @@ # GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] 0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64 -# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] +# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[1,1,0] cbsz:2 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64] 0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x18,0x00,0x0a,0xae,0xd3,0x10,0x31,0x82,0x64 # GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x0b,0xae,0xd3,0x10,0x31,0x82,0x44] @@ -569,7 +539,6 @@ # GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[50:65], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] 0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 - # GFX950: v_mfma_i32_16x16x64_i8 a[0:3], a[0:3], a[0:3], 
a[0:3] ; encoding: [0x00,0x80,0xb6,0xd3,0x00,0x01,0x02,0x1c] 0x00,0x80,0xb6,0xd3,0x00,0x01,0x02,0x1c diff --git a/llvm/test/Transforms/Attributor/value-simplify.ll b/llvm/test/Transforms/Attributor/value-simplify.ll index b81ad63d22b95..8cbbab52766db 100644 --- a/llvm/test/Transforms/Attributor/value-simplify.ll +++ b/llvm/test/Transforms/Attributor/value-simplify.ll @@ -12,6 +12,7 @@ declare ptr @llvm.call.preallocated.arg(token, i32) @ConstPtr = constant i32 0, align 4 @ConstWeakPtr = weak constant i32 0, align 4 @ConstWeakODRPtr = weak_odr constant i32 0, align 4 +@ExtInitZeroInit = externally_initialized constant i32 zeroinitializer, align 4 ;. ; CHECK: @str = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1 @@ -19,6 +20,7 @@ declare ptr @llvm.call.preallocated.arg(token, i32) ; CHECK: @ConstPtr = constant i32 0, align 4 ; CHECK: @ConstWeakPtr = weak constant i32 0, align 4 ; CHECK: @ConstWeakODRPtr = weak_odr constant i32 0, align 4 +; CHECK: @ExtInitZeroInit = externally_initialized constant i32 0, align 4 ; CHECK: @S = external global %struct.X ; CHECK: @g = internal constant { [2 x ptr] } { [2 x ptr] [ptr @f1, ptr @f2] } ; CHECK: @x = external global i32 @@ -1651,6 +1653,23 @@ define i32 @readWeakOdrConst() { ret i32 %l } +define i32 @readExtInitZeroInit() { +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; TUNIT-LABEL: define {{[^@]+}}@readExtInitZeroInit +; TUNIT-SAME: () #[[ATTR2]] { +; TUNIT-NEXT: [[L:%.*]] = load i32, ptr @ExtInitZeroInit, align 4 +; TUNIT-NEXT: ret i32 [[L]] +; +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define {{[^@]+}}@readExtInitZeroInit +; CGSCC-SAME: () #[[ATTR1]] { +; CGSCC-NEXT: [[L:%.*]] = load i32, ptr @ExtInitZeroInit, align 4 +; CGSCC-NEXT: ret i32 [[L]] +; + %l = load i32, ptr @ExtInitZeroInit + ret i32 %l +} + ;. 
; TUNIT: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } ; TUNIT: attributes #[[ATTR1]] = { memory(readwrite, argmem: none) } diff --git a/llvm/test/Transforms/HipStdPar/allocation-interposition.ll b/llvm/test/Transforms/HipStdPar/allocation-interposition.ll index 9ec284b1dedb7..035400f361ba5 100644 --- a/llvm/test/Transforms/HipStdPar/allocation-interposition.ll +++ b/llvm/test/Transforms/HipStdPar/allocation-interposition.ll @@ -16,6 +16,12 @@ declare void @__hipstdpar_hidden_free(ptr) declare ptr @__hipstdpar_hidden_malloc(i64) +declare ptr @__hipstdpar_hidden_memalign(i64, i64) + +declare ptr @__hipstdpar_hidden_mmap(ptr, i64, i32, i32, i32, i64) + +declare i32 @__hipstdpar_hidden_munmap(ptr, i64) + declare ptr @__hipstdpar_realloc(ptr, i64) declare ptr @__hipstdpar_realloc_array(ptr, i64, i64) @@ -171,7 +177,21 @@ define dso_local noundef i32 @allocs() { ; CHECK: call void @__hipstdpar_free(ptr noundef %28) call void @__libc_free(ptr noundef %28) - ret i32 0 + ; CHECK: %29 = call ptr @__libc_malloc(i64 noundef 8) + %29 = call ptr @__hipstdpar_hidden_malloc(i64 noundef 8) + ; CHECK: call void @__libc_free(ptr noundef %29) + call void @__hipstdpar_hidden_free(ptr noundef %29) + + ; CHECK: %30 = call ptr @__libc_memalign(i64 noundef 8, i64 noundef 4) + %30 = call ptr @__hipstdpar_hidden_memalign(i64 noundef 8, i64 noundef 4) + ; CHECK: %31 = call ptr @mmap(ptr %30, i64 8, i32 0, i32 0, i32 0, i64 0) + %31 = call ptr @__hipstdpar_hidden_mmap(ptr %30, i64 8, i32 0, i32 0, i32 0, i64 0) + ; CHECK: %32 = call i32 @munmap(ptr %31, i64 8) + %32 = call i32 @__hipstdpar_hidden_munmap(ptr %31, i64 8) + ; CHECK: call void @__libc_free(ptr noundef %30) + call void @__hipstdpar_hidden_free(ptr noundef %30) + + ret i32 %32 } declare noalias ptr @aligned_alloc(i64 noundef, i64 noundef) @@ -220,4 +240,8 @@ declare void @__libc_free(ptr noundef) declare ptr @__libc_malloc(i64 noundef) -declare ptr @__libc_memalign(i64 noundef, i64 noundef) \ No 
newline at end of file +declare ptr @__libc_memalign(i64 noundef, i64 noundef) + +declare ptr @mmap(ptr noundef, i64 noundef, i32 noundef, i32 noundef, i32 noundef, i64 noundef) + +declare i32 @munmap(ptr noundef, i64 noundef) diff --git a/llvm/test/Transforms/HipStdPar/math-fixup.ll b/llvm/test/Transforms/HipStdPar/math-fixup.ll new file mode 100644 index 0000000000000..914ad2d264eba --- /dev/null +++ b/llvm/test/Transforms/HipStdPar/math-fixup.ll @@ -0,0 +1,498 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=hipstdpar-math-fixup %s | FileCheck %s + +define void @test_acos(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_acos( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_acos_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_acos_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.acos.f64(double %dbl) + %1 = call float @llvm.acos.f32(float %flt) + ret void +} + +define void @test_acosh(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_acosh( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_acosh_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_acosh_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @acosh(double %dbl) + %1 = call float @acoshf(float %flt) + ret void +} + +define void @test_asin(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_asin( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_asin_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_asin_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.asin.f64(double %dbl) + %1 = 
call float @llvm.asin.f32(float %flt) + ret void +} + +define void @test_asinh(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_asinh( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_asinh_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_asinh_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @asinh(double %dbl) + %1 = call float @asinhf(float %flt) + ret void +} + +define void @test_atan(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_atan( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_atan_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_atan_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.atan.f64(double %dbl) + %1 = call float @llvm.atan.f32(float %flt) + ret void +} + +define void @test_atanh(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_atanh( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_atanh_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_atanh_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @atanh(double %dbl) + %1 = call float @atanhf(float %flt) + ret void +} + +define void @test_atan2(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_atan2( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_atan2_f64(double [[DBL]], double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_atan2_f32(float [[FLT]], float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.atan2.f64(double %dbl, double %dbl) + %1 = call float @llvm.atan2.f32(float %flt, float %flt) + ret void +} 
+ +define void @test_cbrt(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_cbrt( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_cbrt_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_cbrt_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @cbrt(double %dbl) + %1 = call float @cbrtf(float %flt) + ret void +} + +define void @test_cos(double %dbl) { +; CHECK-LABEL: define void @test_cos( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_cos_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.cos.f64(double %dbl) + ret void +} + +define void @test_cosh(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_cosh( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_cosh_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_cosh_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.cosh.f64(double %dbl) + %1 = call float @llvm.cosh.f32(float %flt) + ret void +} + +define void @test_erf(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_erf( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_erf_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_erf_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @erf(double %dbl) + %1 = call float @erff(float %flt) + ret void +} + +define void @test_erfc(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_erfc( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_erfc_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = 
call float @__hipstdpar_erfc_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @erfc(double %dbl) + %1 = call float @erfcf(float %flt) + ret void +} + +define void @test_exp(double %dbl) { +; CHECK-LABEL: define void @test_exp( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_exp_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.exp.f64(double %dbl) + ret void +} + +define void @test_exp2(double %dbl) { +; CHECK-LABEL: define void @test_exp2( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_exp2_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.exp2.f64(double %dbl) + ret void +} + +define void @test_expm1(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_expm1( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_expm1_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_expm1_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @expm1(double %dbl) + %1 = call float @expm1f(float %flt) + ret void +} + +define void @test_fdim(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_fdim( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_fdim_f64(double [[DBL]], double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_fdim_f32(float [[FLT]], float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @fdim(double %dbl, double %dbl) + %1 = call float @fdimf(float %flt, float %flt) + ret void +} + +define void @test_hypot(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_hypot( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: 
[[TMP0:%.*]] = call double @__hipstdpar_hypot_f64(double [[DBL]], double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_hypot_f32(float [[FLT]], float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @hypot(double %dbl, double %dbl) + %1 = call float @hypotf(float %flt, float %flt) + ret void +} + +define void @test_lgamma(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_lgamma( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_lgamma_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_lgamma_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @lgamma(double %dbl) + %1 = call float @lgammaf(float %flt) + ret void +} + +define void @test_log(double %dbl) { +; CHECK-LABEL: define void @test_log( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_log_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.log.f64(double %dbl) + ret void +} + +define void @test_log10(double %dbl) { +; CHECK-LABEL: define void @test_log10( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_log10_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.log10.f64(double %dbl) + ret void +} + +define void @test_log2(double %dbl) { +; CHECK-LABEL: define void @test_log2( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_log2_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.log2.f64(double %dbl) + ret void +} + +define void @test_log1p(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_log1p( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double 
@__hipstdpar_log1p_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_log1p_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @log1p(double %dbl) + %1 = call float @log1pf(float %flt) + ret void +} + +define void @test_pow(double %dbl) { +; CHECK-LABEL: define void @test_pow( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_pow_f64(double [[DBL]], double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.pow.f64(double %dbl, double %dbl) + ret void +} + +define void @test_remainder(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_remainder( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_remainder_f64(double [[DBL]], double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_remainder_f32(float [[FLT]], float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @remainder(double %dbl, double %dbl) + %1 = call float @remainderf(float %flt, float %flt) + ret void +} + +define void @test_remquo(double %dbl, float %flt, ptr %p) { +; CHECK-LABEL: define void @test_remquo( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_remquo_f64(double [[DBL]], double [[DBL]], ptr [[P]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_remquo_f32(float [[FLT]], float [[FLT]], ptr [[P]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @remquo(double %dbl, double %dbl, ptr %p) + %1 = call float @remquof(float %flt, float %flt, ptr %p) + ret void +} + +define void @test_sin(double %dbl) { +; CHECK-LABEL: define void @test_sin( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_sin_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call 
double @llvm.sin.f64(double %dbl) + ret void +} + +define void @test_sinh(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_sinh( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_sinh_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_sinh_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.sinh.f64(double %dbl) + %1 = call float @llvm.sinh.f32(float %flt) + ret void +} + +define void @test_tan(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_tan( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_tan_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_tan_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.tan.f64(double %dbl) + %1 = call float @llvm.tan.f32(float %flt) + ret void +} + +define void @test_tanh(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_tanh( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_tanh_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_tanh_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.tanh.f64(double %dbl) + %1 = call float @llvm.tanh.f32(float %flt) + ret void +} + +define void @test_tgamma(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_tgamma( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_tgamma_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_tgamma_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @tgamma(double %dbl) + %1 = call float @tgammaf(float %flt) + ret void +} + +declare hidden double @remainder(double, double) + 
+declare hidden float @remainderf(float, float) + +declare hidden double @remquo(double, double, ptr) + +declare hidden float @remquof(float, float, ptr) + +declare hidden double @fdim(double, double) + +declare hidden float @fdimf(float, float) + +declare double @llvm.exp.f64(double) + +declare float @llvm.exp.f32(float) + +declare double @llvm.exp2.f64(double) + +declare float @llvm.exp2.f32(float) + +declare hidden double @expm1(double) + +declare hidden float @expm1f(float) + +declare double @llvm.log.f64(double) + +declare double @llvm.log10.f64(double) + +declare double @llvm.log2.f64(double) + +declare hidden double @log1p(double) + +declare hidden float @log1pf(float) + +declare double @llvm.pow.f64(double, double) + +declare hidden double @cbrt(double) + +declare hidden float @cbrtf(float) + +declare hidden double @hypot(double, double) + +declare hidden float @hypotf(float, float) + +declare double @llvm.sin.f64(double) + +declare double @llvm.cos.f64(double) + +declare double @llvm.tan.f64(double) + +declare double @llvm.asin.f64(double) + +declare double @llvm.acos.f64(double) + +declare double @llvm.atan.f64(double) + +declare double @llvm.atan2.f64(double, double) + +declare double @llvm.sinh.f64(double) + +declare double @llvm.cosh.f64(double) + +declare double @llvm.tanh.f64(double) + +declare hidden double @asinh(double) + +declare hidden float @asinhf(float) + +declare hidden double @acosh(double) + +declare hidden float @acoshf(float) + +declare hidden double @atanh(double) + +declare hidden float @atanhf(float) + +declare hidden double @erf(double) + +declare hidden float @erff(float) + +declare hidden double @erfc(double) + +declare hidden float @erfcf(float) + +declare hidden double @tgamma(double) + +declare hidden float @tgammaf(float) + +declare hidden double @lgamma(double) + +declare hidden float @lgammaf(float) diff --git a/llvm/test/Transforms/IndVarSimplify/ARM/code-size.ll b/llvm/test/Transforms/IndVarSimplify/ARM/code-size.ll index 
c7231392229c9..2003b1a72206d 100644 --- a/llvm/test/Transforms/IndVarSimplify/ARM/code-size.ll +++ b/llvm/test/Transforms/IndVarSimplify/ARM/code-size.ll @@ -4,30 +4,32 @@ define i32 @remove_loop(i32 %size) #0 { ; CHECK-V8M-LABEL: @remove_loop( +; CHECK-V8M-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-V8M-NEXT: entry: -; CHECK-V8M-NEXT: [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 31 +; CHECK-V8M-NEXT: br label %[[WHILE_COND:.*]] +; CHECK-V8M: while.cond: +; CHECK-V8M-NEXT: br i1 false, label %[[WHILE_COND]], label %[[WHILE_END:.*]] +; CHECK-V8M: while.end: +; CHECK-V8M-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], 31 ; CHECK-V8M-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SIZE]], i32 31) ; CHECK-V8M-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[UMIN]] ; CHECK-V8M-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 5 ; CHECK-V8M-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 5 -; CHECK-V8M-NEXT: br label [[WHILE_COND:%.*]] -; CHECK-V8M: while.cond: -; CHECK-V8M-NEXT: br i1 false, label [[WHILE_COND]], label [[WHILE_END:%.*]] -; CHECK-V8M: while.end: ; CHECK-V8M-NEXT: [[TMP4:%.*]] = sub i32 [[SIZE]], [[TMP3]] ; CHECK-V8M-NEXT: ret i32 [[TMP4]] ; ; CHECK-V8A-LABEL: @remove_loop( +; CHECK-V8A-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-V8A-NEXT: entry: -; CHECK-V8A-NEXT: [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 31 +; CHECK-V8A-NEXT: br label %[[WHILE_COND:.*]] +; CHECK-V8A: while.cond: +; CHECK-V8A-NEXT: br i1 false, label %[[WHILE_COND]], label %[[WHILE_END:.*]] +; CHECK-V8A: while.end: +; CHECK-V8A-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], 31 ; CHECK-V8A-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SIZE]], i32 31) ; CHECK-V8A-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[UMIN]] ; CHECK-V8A-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 5 ; CHECK-V8A-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 5 -; CHECK-V8A-NEXT: br label [[WHILE_COND:%.*]] -; CHECK-V8A: while.cond: -; CHECK-V8A-NEXT: br i1 false, label [[WHILE_COND]], label [[WHILE_END:%.*]] -; CHECK-V8A: while.end: ; CHECK-V8A-NEXT: 
[[TMP4:%.*]] = sub i32 [[SIZE]], [[TMP3]] ; CHECK-V8A-NEXT: ret i32 [[TMP4]] ; diff --git a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll index f907f23e0b520..2261423766792 100644 --- a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll +++ b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-unroll-imm-cost.ll @@ -77,7 +77,6 @@ define dso_local arm_aapcscc void @test(ptr nocapture %pDest, ptr nocapture read ; CHECK-NEXT: [[CMP2780:%.*]] = icmp ugt i32 [[ADD25]], [[J_0_LCSSA]] ; CHECK-NEXT: br i1 [[CMP2780]], label [[FOR_BODY29_PREHEADER:%.*]], label [[FOR_END40]] ; CHECK: for.body29.preheader: -; CHECK-NEXT: [[TMP10:%.*]] = sub nsw i32 [[ADD25]], [[J_0_LCSSA]] ; CHECK-NEXT: br label [[FOR_BODY29:%.*]] ; CHECK: for.body29: ; CHECK-NEXT: [[J_184:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY29]] ], [ [[J_0_LCSSA]], [[FOR_BODY29_PREHEADER]] ] @@ -101,6 +100,7 @@ define dso_local arm_aapcscc void @test(ptr nocapture %pDest, ptr nocapture read ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ADD25]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END40_LOOPEXIT:%.*]], label [[FOR_BODY29]] ; CHECK: for.end40.loopexit: +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw i32 [[ADD25]], [[J_0_LCSSA]] ; CHECK-NEXT: [[SCEVGEP93:%.*]] = getelementptr i16, ptr [[PSRCB_ADDR_1_LCSSA]], i32 [[TMP10]] ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, ptr [[PSRCA_ADDR_1_LCSSA]], i32 [[TMP10]] ; CHECK-NEXT: [[SCEVGEP94:%.*]] = getelementptr i32, ptr [[PDEST_ADDR_1_LCSSA]], i32 [[TMP10]] diff --git a/llvm/test/Transforms/IndVarSimplify/exit-count-select.ll b/llvm/test/Transforms/IndVarSimplify/exit-count-select.ll index 21806c7f2cdc3..1592b84480e3f 100644 --- a/llvm/test/Transforms/IndVarSimplify/exit-count-select.ll +++ b/llvm/test/Transforms/IndVarSimplify/exit-count-select.ll @@ -4,11 +4,11 @@ define i32 @logical_and_2ops(i32 %n, i32 %m) { ; CHECK-LABEL: @logical_and_2ops( ; CHECK-NEXT: entry: 
-; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[M:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: +; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[M:%.*]] ; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[N:%.*]]) ; CHECK-NEXT: ret i32 [[UMIN]] ; @@ -28,11 +28,11 @@ exit: define i32 @logical_or_2ops(i32 %n, i32 %m) { ; CHECK-LABEL: @logical_or_2ops( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[M:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: +; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[M:%.*]] ; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[N:%.*]]) ; CHECK-NEXT: ret i32 [[UMIN]] ; @@ -52,13 +52,13 @@ exit: define i32 @logical_and_3ops(i32 %n, i32 %m, i32 %k) { ; CHECK-LABEL: @logical_and_3ops( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[K:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[M:%.*]] -; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP1]]) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: +; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[K:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[M:%.*]] +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP1]]) ; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN]], i32 [[N:%.*]]) ; CHECK-NEXT: ret i32 [[UMIN1]] ; @@ -80,13 +80,13 @@ exit: define i32 @logical_or_3ops(i32 %n, i32 %m, i32 %k) { ; CHECK-LABEL: @logical_or_3ops( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[K:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[M:%.*]] -; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP1]]) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] ; 
CHECK: exit: +; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[K:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[M:%.*]] +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP1]]) ; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[UMIN]], i32 [[N:%.*]]) ; CHECK-NEXT: ret i32 [[UMIN1]] ; diff --git a/llvm/test/Transforms/IndVarSimplify/exit_value_test3.ll b/llvm/test/Transforms/IndVarSimplify/exit_value_test3.ll index aba7532f5ed92..c03cd95a8c861 100644 --- a/llvm/test/Transforms/IndVarSimplify/exit_value_test3.ll +++ b/llvm/test/Transforms/IndVarSimplify/exit_value_test3.ll @@ -4,9 +4,9 @@ ; is high because the loop can be deleted after the exit value rewrite. ; ; CHECK-LABEL: @_Z3fooPKcjj( -; CHECK: udiv ; CHECK: [[LABEL:^[a-zA-Z0-9_.]+]]: ; CHECK-NOT: br {{.*}} [[LABEL]] +; CHECK: udiv define i32 @_Z3fooPKcjj(ptr nocapture readnone %s, i32 %len, i32 %c) #0 { entry: diff --git a/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll b/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll index 3c6b12dac2119..e006d9f6696ca 100644 --- a/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll +++ b/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll @@ -932,17 +932,17 @@ for.end: ; preds = %for.body, %entry define i16 @ult_multiuse_profit(i16 %n.raw, i8 %start) mustprogress { ; CHECK-LABEL: @ult_multiuse_profit( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = add i8 [[START:%.*]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i16 254 to i8 +; CHECK-NEXT: [[TMP0:%.*]] = trunc i16 254 to i8 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[START]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[TMP2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], 
label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[UMAX:%.*]] = call i16 @llvm.umax.i16(i16 [[TMP1]], i16 254) +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[START:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i16 +; CHECK-NEXT: [[UMAX:%.*]] = call i16 @llvm.umax.i16(i16 [[TMP2]], i16 254) ; CHECK-NEXT: ret i16 [[UMAX]] ; entry: diff --git a/llvm/test/Transforms/IndVarSimplify/pr116483.ll b/llvm/test/Transforms/IndVarSimplify/pr116483.ll index ae108a525223e..093e25a3caa81 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr116483.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr116483.ll @@ -4,15 +4,15 @@ define i32 @test() { ; CHECK-LABEL: define i32 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP_BODY:.*]] +; CHECK: [[LOOP_BODY]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP_BODY]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: [[XOR:%.*]] = xor i32 0, 3 ; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[XOR]], 329 ; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[MUL]] to i16 ; CHECK-NEXT: [[SEXT:%.*]] = shl i16 [[CONV]], 8 ; CHECK-NEXT: [[CONV1:%.*]] = ashr i16 [[SEXT]], 8 -; CHECK-NEXT: br label %[[LOOP_BODY:.*]] -; CHECK: [[LOOP_BODY]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP_BODY]] -; CHECK: [[EXIT]]: ; CHECK-NEXT: [[CONV3:%.*]] = zext i16 [[CONV1]] to i32 ; CHECK-NEXT: ret i32 [[CONV3]] ; diff --git a/llvm/test/Transforms/IndVarSimplify/pr63763.ll b/llvm/test/Transforms/IndVarSimplify/pr63763.ll index 4e62e92ca07ee..427db1e67410a 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr63763.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr63763.ll @@ -16,12 +16,12 @@ define i32 @test(i1 %c) { ; CHECK-NEXT: [[CONV2:%.*]] = ashr exact i32 [[SEXT]], 24 ; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub nsw i32 7, [[CONV2]] ; CHECK-NEXT: call void @use(i32 [[INVARIANT_OP]]) -; CHECK-NEXT: [[SEXT_US:%.*]] = shl i32 [[SEL]], 24 -; CHECK-NEXT: [[CONV2_US:%.*]] = ashr exact i32 [[SEXT_US]], 24 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; 
CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: +; CHECK-NEXT: [[SEXT_US:%.*]] = shl i32 [[SEL]], 24 +; CHECK-NEXT: [[CONV2_US:%.*]] = ashr exact i32 [[SEXT_US]], 24 ; CHECK-NEXT: [[INVARIANT_OP_US:%.*]] = sub nsw i32 7, [[CONV2_US]] ; CHECK-NEXT: ret i32 [[INVARIANT_OP_US]] ; diff --git a/llvm/test/Transforms/IndVarSimplify/replace-loop-exit-folds.ll b/llvm/test/Transforms/IndVarSimplify/replace-loop-exit-folds.ll index 4692a542053c9..b3162de0f2245 100644 --- a/llvm/test/Transforms/IndVarSimplify/replace-loop-exit-folds.ll +++ b/llvm/test/Transforms/IndVarSimplify/replace-loop-exit-folds.ll @@ -4,20 +4,21 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" define i32 @remove_loop(i32 %size) { -; CHECK-LABEL: @remove_loop( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 31 +; CHECK-LABEL: define i32 @remove_loop( +; CHECK-SAME: i32 [[SIZE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[WHILE_COND:.*]] +; CHECK: [[WHILE_COND]]: +; CHECK-NEXT: [[SIZE_ADDR_0:%.*]] = phi i32 [ [[SIZE]], %[[ENTRY]] ], [ [[SUB:%.*]], %[[WHILE_COND]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SIZE_ADDR_0]], 31 +; CHECK-NEXT: [[SUB]] = add i32 [[SIZE_ADDR_0]], -32 +; CHECK-NEXT: br i1 [[CMP]], label %[[WHILE_COND]], label %[[WHILE_END:.*]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], 31 ; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SIZE]], i32 31) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[UMIN]] ; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 5 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 5 -; CHECK-NEXT: br label [[WHILE_COND:%.*]] -; CHECK: while.cond: -; CHECK-NEXT: [[SIZE_ADDR_0:%.*]] = phi i32 [ [[SIZE]], [[ENTRY:%.*]] ], [ [[SUB:%.*]], [[WHILE_COND]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SIZE_ADDR_0]], 31 -; CHECK-NEXT: [[SUB]] = add i32 [[SIZE_ADDR_0]], -32 -; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_COND]], label 
[[WHILE_END:%.*]] -; CHECK: while.end: ; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[SIZE]], [[TMP3]] ; CHECK-NEXT: ret i32 [[TMP4]] ; diff --git a/llvm/test/Transforms/IndVarSimplify/sentinel.ll b/llvm/test/Transforms/IndVarSimplify/sentinel.ll index d1140affb5a4b..523414167956b 100644 --- a/llvm/test/Transforms/IndVarSimplify/sentinel.ll +++ b/llvm/test/Transforms/IndVarSimplify/sentinel.ll @@ -10,18 +10,18 @@ define void @test(i1 %arg) personality ptr @snork { ; CHECK-NEXT: br label [[BB4:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add i32 [[INDVARS_IV:%.*]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[TMP1:%.*]], [[SMAX:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[TMP6:%.*]], [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMAX:%.*]] ; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB2:%.*]], label [[BB4]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP0]], [[BB1:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP1]], [[BB1:%.*]] ] ; CHECK-NEXT: ret void ; CHECK: bb4: ; CHECK-NEXT: [[INDVARS_IV]] = phi i32 [ [[INDVARS_IV_NEXT]], [[BB1]] ], [ undef, [[BB:%.*]] ] ; CHECK-NEXT: [[SMAX]] = call i32 @llvm.smax.i32(i32 [[INDVARS_IV]], i32 36) -; CHECK-NEXT: [[TMP6:%.*]] = invoke i32 @quux() [ "deopt"(i32 0, i32 0, i32 0, i32 180, i32 0, i32 25, i32 0, i32 7, ptr null, i32 7, ptr null, i32 7, ptr null, i32 3, i32 [[INDVARS_IV]], i32 3, i32 undef, i32 7, ptr null, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 4, double undef, i32 7, ptr null, i32 4, i64 undef, i32 7, ptr null, i32 0, ptr addrspace(1) undef, i32 3, i32 undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 7, ptr null) ] +; CHECK-NEXT: [[TMP6]] = invoke i32 @quux() [ "deopt"(i32 0, i32 0, i32 0, i32 180, i32 0, i32 25, i32 0, i32 7, ptr null, 
i32 7, ptr null, i32 7, ptr null, i32 3, i32 [[INDVARS_IV]], i32 3, i32 undef, i32 7, ptr null, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 4, double undef, i32 7, ptr null, i32 4, i64 undef, i32 7, ptr null, i32 0, ptr addrspace(1) undef, i32 3, i32 undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 7, ptr null) ] ; CHECK-NEXT: to label [[BB7:%.*]] unwind label [[BB15:%.*]] ; CHECK: bb7: -; CHECK-NEXT: [[TMP1]] = add i32 [[TMP6]], [[INDVARS_IV]] ; CHECK-NEXT: br label [[BB9:%.*]] ; CHECK: bb9: ; CHECK-NEXT: br i1 true, label [[BB1]], label [[BB9]] diff --git a/llvm/test/Transforms/Inline/AMDGPU/load-intrinsics.ll b/llvm/test/Transforms/Inline/AMDGPU/load-intrinsics.ll new file mode 100644 index 0000000000000..c95f61c89bfbf --- /dev/null +++ b/llvm/test/Transforms/Inline/AMDGPU/load-intrinsics.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -mtriple=amdgcn --passes=inline --enable-noalias-to-md-conversion -S %s | FileCheck --check-prefix=OPT %s + +; This test tests if the load intrinsic gets correct memory(argmem: read) attribute and +; the call instruction is assigned correct !alias.scope metadata post inlining + +define void @caller(ptr addrspace(3) %addr_f, ptr addrspace(1) %use_f) { +; OPT-LABEL: define void @caller( +; OPT-SAME: ptr addrspace(3) [[ADDR_F:%.*]], ptr addrspace(1) [[USE_F:%.*]]) { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]]) +; OPT-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) +; OPT-NEXT: [[GEP_I:%.*]] = getelementptr i64, ptr addrspace(3) [[ADDR_F]], i32 4 +; OPT-NEXT: [[VAL_I:%.*]] = 
call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) [[GEP_I]]), !alias.scope [[META0]], !noalias [[META3]] +; OPT-NEXT: store <2 x i32> [[VAL_I]], ptr addrspace(1) [[USE_F]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; OPT-NEXT: ret void +; +entry: + call void @callee(ptr addrspace(3) %addr_f, ptr addrspace(1) %use_f) + ret void +} + +define void @callee(ptr addrspace(3) noalias %addr, ptr addrspace(1) noalias %use) { +; OPT-LABEL: define void @callee( +; OPT-SAME: ptr addrspace(3) noalias [[ADDR:%.*]], ptr addrspace(1) noalias [[USE:%.*]]) { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[GEP:%.*]] = getelementptr i64, ptr addrspace(3) [[ADDR]], i32 4 +; OPT-NEXT: [[VAL:%.*]] = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) [[GEP]]) +; OPT-NEXT: store <2 x i32> [[VAL]], ptr addrspace(1) [[USE]], align 8 +; OPT-NEXT: ret void +; +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3) %gep) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} +;. +; Check Function Attribute on decl +; OPT: declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) nocapture) #[[ATTR0:[0-9]+]] +declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3)) +; OPT: attributes #[[ATTR0]] = { convergent nocallback nofree nounwind willreturn memory(argmem: read) } +; OPT: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } +;. +; OPT: [[META0]] = !{[[META1:![0-9]+]]} +; OPT: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]], !"callee: %addr"} +; OPT: [[META2]] = distinct !{[[META2]], !"callee"} +; OPT: [[META3]] = !{[[META4:![0-9]+]]} +; OPT: [[META4]] = distinct !{[[META4]], [[META2]], !"callee: %use"} +;. 
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readfirstlane.ll new file mode 100644 index 0000000000000..60561459e3f11 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -0,0 +1,675 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 -passes=instcombine -S < %s | FileCheck %s + +; test unary + +define float @hoist_fneg_f32(float %arg) { +; CHECK-LABEL: define float @hoist_fneg_f32( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fneg float [[TMP0]] +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fneg float %arg + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define double @hoist_fneg_f64(double %arg) { +; CHECK-LABEL: define double @hoist_fneg_f64( +; CHECK-SAME: double [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fneg double [[TMP0]] +; CHECK-NEXT: ret double [[RFL]] +; +bb: + %val = fneg double %arg + %rfl = call double @llvm.amdgcn.readfirstlane.f64(double %val) + ret double %rfl +} + +; test casts + +define i32 @hoist_trunc(i64 %arg) { +; CHECK-LABEL: define i32 @hoist_trunc( +; CHECK-SAME: i64 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[RFL]] to i32 +; CHECK-NEXT: ret i32 [[TMP0]] +; +bb: + %val = trunc i64 %arg to i32 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i64 @hoist_zext(i32 %arg) { +; CHECK-LABEL: define i64 @hoist_zext( +; CHECK-SAME: i32 [[ARG:%.*]]) 
#[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[RFL]] to i64 +; CHECK-NEXT: ret i64 [[TMP0]] +; +bb: + %val = zext i32 %arg to i64 + %rfl = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %val) + ret i64 %rfl +} + +define i64 @hoist_sext(i32 %arg) { +; CHECK-LABEL: define i64 @hoist_sext( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[RFL]] to i64 +; CHECK-NEXT: ret i64 [[TMP0]] +; +bb: + %val = zext i32 %arg to i64 + %rfl = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %val) + ret i64 %rfl +} + +define i32 @hoist_fptoui(float %arg) { +; CHECK-LABEL: define i32 @hoist_fptoui( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = fptoui float [[RFL]] to i32 +; CHECK-NEXT: ret i32 [[TMP0]] +; +bb: + %val = fptoui float %arg to i32 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_fptosi(float %arg) { +; CHECK-LABEL: define i32 @hoist_fptosi( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = fptosi float [[RFL]] to i32 +; CHECK-NEXT: ret i32 [[TMP0]] +; +bb: + %val = fptosi float %arg to i32 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_uitofp(i32 %arg) { +; CHECK-LABEL: define float @hoist_uitofp( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = uitofp i32 [[RFL]] to float +; CHECK-NEXT: ret float [[TMP0]] +; +bb: + 
%val = uitofp i32 %arg to float + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define float @hoist_sitofp(i32 %arg) { +; CHECK-LABEL: define float @hoist_sitofp( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = sitofp i32 [[RFL]] to float +; CHECK-NEXT: ret float [[TMP0]] +; +bb: + %val = sitofp i32 %arg to float + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define float @hoist_fptrunc(double %arg) { +; CHECK-LABEL: define float @hoist_fptrunc( +; CHECK-SAME: double [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = fptrunc double [[RFL]] to float +; CHECK-NEXT: ret float [[TMP0]] +; +bb: + %val = fptrunc double %arg to float + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define float @hoist_fpext(half %arg) { +; CHECK-LABEL: define float @hoist_fpext( +; CHECK-SAME: half [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call half @llvm.amdgcn.readfirstlane.f16(half [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = fpext half [[RFL]] to float +; CHECK-NEXT: ret float [[TMP0]] +; +bb: + %val = fpext half %arg to float + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define i64 @hoist_ptrtoint(ptr %arg) { +; CHECK-LABEL: define i64 @hoist_ptrtoint( +; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[RFL]] to i64 +; CHECK-NEXT: ret i64 [[TMP0]] +; +bb: + %val = ptrtoint ptr %arg to i64 + %rfl = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %val) + ret i64 %rfl +} + +define ptr @hoist_inttoptr(i64 %arg) { +; 
CHECK-LABEL: define ptr @hoist_inttoptr( +; CHECK-SAME: i64 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[RFL]] to ptr +; CHECK-NEXT: ret ptr [[TMP0]] +; +bb: + %val = inttoptr i64 %arg to ptr + %rfl = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %val) + ret ptr %rfl +} + +define float @hoist_bitcast(i32 %arg) { +; CHECK-LABEL: define float @hoist_bitcast( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[RFL]] to float +; CHECK-NEXT: ret float [[TMP0]] +; +bb: + %val = bitcast i32 %arg to float + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define ptr addrspace(1) @hoist_addrspacecast(ptr addrspace(0) %arg) { +; CHECK-LABEL: define ptr addrspace(1) @hoist_addrspacecast( +; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[RFL]] to ptr addrspace(1) +; CHECK-NEXT: ret ptr addrspace(1) [[TMP0]] +; +bb: + %val = addrspacecast ptr addrspace(0) %arg to ptr addrspace(1) + %rfl = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) %val) + ret ptr addrspace(1) %rfl +} + +; test binary i32 + +define i32 @hoist_add_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_add_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = add i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_fadd_f32(float %arg) { +; CHECK-LABEL: define float 
@hoist_fadd_f32( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fadd float [[TMP0]], 1.280000e+02 +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fadd float %arg, 128.0 + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define i32 @hoist_sub_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_sub_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP0]], -16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = sub i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_fsub_f32(float %arg) { +; CHECK-LABEL: define float @hoist_fsub_f32( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fadd float [[TMP0]], -1.280000e+02 +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fsub float %arg, 128.0 + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define i32 @hoist_mul_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_mul_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = mul i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = mul i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_fmul_f32(float %arg) { +; CHECK-LABEL: define float @hoist_fmul_f32( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float 
[[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fmul float [[TMP0]], 1.280000e+02 +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fmul float %arg, 128.0 + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define i32 @hoist_udiv_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_udiv_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = udiv i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = udiv i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_sdiv_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_sdiv_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = sdiv i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = sdiv i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_fdiv_f32(float %arg) { +; CHECK-LABEL: define float @hoist_fdiv_f32( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fmul float [[TMP0]], 7.812500e-03 +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fdiv float %arg, 128.0 + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define i32 @hoist_urem_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_urem_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = urem i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = urem i32 %arg, 16777215 + %rfl = call i32 
@llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_srem_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_srem_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = srem i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = srem i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_frem_f32(float %arg) { +; CHECK-LABEL: define float @hoist_frem_f32( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = frem float [[TMP0]], 1.280000e+02 +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = frem float %arg, 128.0 + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define i32 @hoist_shl_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_shl_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = shl i32 [[TMP0]], 4 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = shl i32 %arg, 4 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_lshr_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_lshr_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = lshr i32 [[TMP0]], 4 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = lshr i32 %arg, 4 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_ashr_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_ashr_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; 
CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = ashr i32 [[TMP0]], 4 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = ashr i32 %arg, 4 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + + +define i32 @hoist_and_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_and_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = and i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = and i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_or_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_or_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = or i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = or i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_xor_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_xor_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = xor i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = xor i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +; test binary i64 + +define i64 @hoist_and_i64(i64 %arg) { +; CHECK-LABEL: define i64 @hoist_and_i64( +; CHECK-SAME: i64 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = and i64 [[TMP0]], 16777215 +; CHECK-NEXT: ret i64 [[RFL]] +; +bb: + %val = and i64 %arg, 16777215 + %rfl = call i64 
@llvm.amdgcn.readfirstlane.i64(i64 %val) + ret i64 %rfl +} + +define double @hoist_fadd_f64(double %arg) { +; CHECK-LABEL: define double @hoist_fadd_f64( +; CHECK-SAME: double [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fadd double [[TMP0]], 1.280000e+02 +; CHECK-NEXT: ret double [[RFL]] +; +bb: + %val = fadd double %arg, 128.0 + %rfl = call double @llvm.amdgcn.readfirstlane.f64(double %val) + ret double %rfl +} + +; test constant on LHS + +define i32 @hoist_sub_i32_lhs(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_sub_i32_lhs( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = sub i32 16777215, [[TMP0]] +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = sub i32 16777215, %arg + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_fsub_f32_lhs(float %arg) { +; CHECK-LABEL: define float @hoist_fsub_f32_lhs( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fsub float 1.280000e+02, [[TMP0]] +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fsub float 128.0, %arg + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +; test other operand is trivially uniform + +define i32 @hoist_add_i32_trivially_uniform_rhs(i32 %arg, i32 %v.other) { +; CHECK-LABEL: define i32 @hoist_add_i32_trivially_uniform_rhs( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[V_OTHER:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[OTHER:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V_OTHER]]) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP0]], 
[[OTHER]] +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %other = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %v.other) + %val = add i32 %arg, %other + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_add_i32_trivially_uniform_lhs(i32 %arg, i32 %v.other) { +; CHECK-LABEL: define i32 @hoist_add_i32_trivially_uniform_lhs( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[V_OTHER:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[OTHER:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V_OTHER]]) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = sub i32 [[OTHER]], [[TMP0]] +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %other = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %v.other) + %val = sub i32 %other, %arg + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +; test multiple iterations + +define i32 @hoist_multiple_times(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_multiple_times( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 16777215, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 4242 +; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP3]], 6 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val.0 = shl i32 %arg, 2 + %val.1 = sub i32 16777215, %val.0 + %val.2 = xor i32 %val.1, 4242 + %val.3 = add i32 %val.2, 6 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val.3) + ret i32 %rfl +} + +; test cases where hoisting isn't possible + +define i32 @cross_block_hoisting(i1 %cond, i32 %arg) { +; CHECK-LABEL: define i32 @cross_block_hoisting( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[VAL:%.*]] = add i32 [[ARG]], 16777215 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]] +; 
CHECK: [[THEN]]: +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[VAL]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RFL]], %[[THEN]] ], [ [[VAL]], %[[BB]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +bb: + %val = add i32 %arg, 16777215 + br i1 %cond, label %then, label %end + +then: + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + br label %end + +end: + %res = phi i32 [%rfl, %then], [%val, %bb] + ret i32 %res +} + +define i32 @operand_is_instr(i32 %arg, ptr %src) { +; CHECK-LABEL: define i32 @operand_is_instr( +; CHECK-SAME: i32 [[ARG:%.*]], ptr [[SRC:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[OTHER:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: [[VAL:%.*]] = add i32 [[ARG]], [[OTHER]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[VAL]]) +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %other = load i32, ptr %src + %val = add i32 %arg, %other + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @operand_is_arg(i32 %arg, i32 %other) { +; CHECK-LABEL: define i32 @operand_is_arg( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[OTHER:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[VAL:%.*]] = add i32 [[ARG]], [[OTHER]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[VAL]]) +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = add i32 %arg, %other + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +; test that convergence tokens are preserved + +define i32 @hoist_preserves_convergence_token(i1 %cond, i32 %arg) convergent { +; CHECK-LABEL: define i32 @hoist_preserves_convergence_token( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[ARG:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[ENTRY:%.*]] = call token @llvm.experimental.convergence.entry() +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]] +; CHECK: [[THEN]]: +; 
CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) [ "convergencectrl"(token [[ENTRY]]) ] +; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP0]], 16777215 +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RFL]], %[[THEN]] ], [ [[ARG]], %[[BB]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +bb: + %entry = call token @llvm.experimental.convergence.entry() + br i1 %cond, label %then, label %end + +then: + %val = add i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) [ "convergencectrl"(token %entry)] + br label %end + +end: + %res = phi i32 [%rfl, %then], [%arg, %bb] + ret i32 %res +} diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readlane.ll new file mode 100644 index 0000000000000..a9ac4bc93fd3c --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readlane.ll @@ -0,0 +1,232 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 -passes=instcombine -S < %s | FileCheck %s + +; The readfirstlane version of this test covers all the interesting cases of the +; shared logic. This testcase focuses on readlane specific pitfalls. 
+ +; test unary + +define float @hoist_fneg_f32(float %arg, i32 %lane) { +; CHECK-LABEL: define float @hoist_fneg_f32( +; CHECK-SAME: float [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = fneg float [[RL]] +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fneg float %arg + %rl = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane) + ret float %rl +} + +define double @hoist_fneg_f64(double %arg, i32 %lane) { +; CHECK-LABEL: define double @hoist_fneg_f64( +; CHECK-SAME: double [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = fneg double [[RL]] +; CHECK-NEXT: ret double [[RFL]] +; +bb: + %val = fneg double %arg + %rl = call double @llvm.amdgcn.readlane.f64(double %val, i32 %lane) + ret double %rl +} + +; test casts + +define i32 @hoist_trunc(i64 %arg, i32 %lane) { +; CHECK-LABEL: define i32 @hoist_trunc( +; CHECK-SAME: i64 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call i64 @llvm.amdgcn.readlane.i64(i64 [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[RL]] to i32 +; CHECK-NEXT: ret i32 [[TMP0]] +; +bb: + %val = trunc i64 %arg to i32 + %rl = call i32 @llvm.amdgcn.readlane.i32(i32 %val, i32 %lane) + ret i32 %rl +} + +define i64 @hoist_zext(i32 %arg, i32 %lane) { +; CHECK-LABEL: define i64 @hoist_zext( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[RL]] to i64 +; CHECK-NEXT: ret i64 [[TMP0]] +; +bb: + %val = zext i32 %arg to i64 + %rl = call i64 @llvm.amdgcn.readlane.i64(i64 %val, i32 %lane) + ret i64 %rl +} + +; test binary i32 + 
+define i32 @hoist_add_i32(i32 %arg, i32 %lane) { +; CHECK-LABEL: define i32 @hoist_add_i32( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = add i32 [[RL]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = add i32 %arg, 16777215 + %rl = call i32 @llvm.amdgcn.readlane.i32(i32 %val, i32 %lane) + ret i32 %rl +} + +define float @hoist_fadd_f32(float %arg, i32 %lane) { +; CHECK-LABEL: define float @hoist_fadd_f32( +; CHECK-SAME: float [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = fadd float [[RL]], 1.280000e+02 +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fadd float %arg, 128.0 + %rl = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane) + ret float %rl +} + +; test binary i64 + +define i64 @hoist_and_i64(i64 %arg, i32 %lane) { +; CHECK-LABEL: define i64 @hoist_and_i64( +; CHECK-SAME: i64 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call i64 @llvm.amdgcn.readlane.i64(i64 [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = and i64 [[RL]], 16777215 +; CHECK-NEXT: ret i64 [[RFL]] +; +bb: + %val = and i64 %arg, 16777215 + %rl = call i64 @llvm.amdgcn.readlane.i64(i64 %val, i32 %lane) + ret i64 %rl +} + +define double @hoist_fadd_f64(double %arg, i32 %lane) { +; CHECK-LABEL: define double @hoist_fadd_f64( +; CHECK-SAME: double [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = fadd double [[RL]], 1.280000e+02 +; CHECK-NEXT: ret double [[RFL]] +; +bb: + %val = fadd double %arg, 128.0 + %rl = call double @llvm.amdgcn.readlane.f64(double %val, i32 %lane) + ret 
double %rl +} + +; test constant on LHS + +define i32 @hoist_sub_i32_lhs(i32 %arg, i32 %lane) { +; CHECK-LABEL: define i32 @hoist_sub_i32_lhs( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = sub i32 16777215, [[RL]] +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = sub i32 16777215, %arg + %rl = call i32 @llvm.amdgcn.readlane.i32(i32 %val, i32 %lane) + ret i32 %rl +} + +define float @hoist_fsub_f32_lhs(float %arg, i32 %lane) { +; CHECK-LABEL: define float @hoist_fsub_f32_lhs( +; CHECK-SAME: float [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = fsub float 1.280000e+02, [[RL]] +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fsub float 128.0, %arg + %rl = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane) + ret float %rl +} + +define i32 @readlane_lane_op_in_other_block(i1 %cond, i32 %arg, i32 %base) { +; CHECK-LABEL: define i32 @readlane_lane_op_in_other_block( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[ARG:%.*]], i32 [[BASE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[LANE:%.*]] = add i32 [[BASE]], 2 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[RL]], 16777215 +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[TMP0]], %[[THEN]] ], [ [[LANE]], %[[BB]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +bb: + %lane = add i32 %base, 2 + br i1 %cond, label %then, label %end + +then: + %val = add i32 %arg, 16777215 + %rl = call i32 @llvm.amdgcn.readlane.i32(i32 %val, i32 %lane) + br label %end + +end: + %res = phi i32 [%rl, %then], [%lane, %bb] + ret i32 
%res +} + +; Check cases where we can't move the readlane higher + +define float @cannot_move_readlane(float %arg, i32 %base) { +; CHECK-LABEL: define float @cannot_move_readlane( +; CHECK-SAME: float [[ARG:%.*]], i32 [[BASE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[VAL:%.*]] = fsub float 1.280000e+02, [[ARG]] +; CHECK-NEXT: [[LANE:%.*]] = add i32 [[BASE]], 2 +; CHECK-NEXT: [[RFL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE]]) +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fsub float 128.0, %arg + %lane = add i32 %base, 2 + %rl = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane) + ret float %rl +} + + +; test that convergence tokens are preserved + +define i32 @hoist_preserves_convergence_token(i1 %cond, i32 %arg, i32 %lane) convergent { +; CHECK-LABEL: define i32 @hoist_preserves_convergence_token( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[ENTRY:%.*]] = call token @llvm.experimental.convergence.entry() +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) [ "convergencectrl"(token [[ENTRY]]) ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[RL]], 16777215 +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[TMP0]], %[[THEN]] ], [ [[ARG]], %[[BB]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +bb: + %entry = call token @llvm.experimental.convergence.entry() + br i1 %cond, label %then, label %end + +then: + %val = add i32 %arg, 16777215 + %rl = call i32 @llvm.amdgcn.readlane.i32(i32 %val, i32 %lane) [ "convergencectrl"(token %entry)] + br label %end + +end: + %res = phi i32 [%rl, %then], [%arg, %bb] + ret i32 %res +} diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/phi-with-incoming-from-load.ll b/llvm/test/Transforms/InstCombine/AMDGPU/phi-with-incoming-from-load.ll new 
file mode 100644 index 0000000000000..14fb45e43af86 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/phi-with-incoming-from-load.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=instcombine -S -o - %s | FileCheck %s + +target triple = "amdgcn-amd-amdhsa" + +%double_double = type { double, double } + +declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) + +define void @_test(ptr addrspace(4) byref(%double_double) align 8 %in) { +; CHECK-LABEL: define void @_test( +; CHECK-SAME: ptr addrspace(4) byref([[DOUBLE_DOUBLE:%.*]]) align 8 [[IN:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ALPHA_UNION:%.*]] = addrspacecast ptr addrspace(4) [[IN]] to ptr +; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(5) null, align 1 +; CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[LOAD]] to i1 +; CHECK-NEXT: br i1 [[LOADEDV]], label %[[COND_END:.*]], label %[[COND_FALSE:.*]] +; CHECK: [[COND_FALSE]]: +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(4) [[IN]], align 8 +; CHECK-NEXT: br label %[[COND_END]] +; CHECK: [[COND_END]]: +; CHECK-NEXT: [[COND1:%.*]] = phi ptr [ [[TMP0]], %[[COND_FALSE]] ], [ [[ALPHA_UNION]], %[[ENTRY]] ] +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(16) poison, ptr noundef nonnull align 1 dereferenceable(16) [[COND1]], i64 16, i1 false) +; CHECK-NEXT: ret void +; +entry: + %coerce = alloca %double_double, align 8, addrspace(5) + %alpha_union = addrspacecast ptr addrspace(5) %coerce to ptr + call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %coerce, ptr addrspace(4) align 8 %in, i64 16, i1 false) + %load1 = load i8, ptr addrspace(5) null, align 1 + %loadedv = trunc i8 %load1 to i1 + br i1 %loadedv, label %cond.end, label %cond.false + +cond.false: + %load2 = load ptr, ptr addrspace(5) %coerce, align 8 + br label %cond.end + +cond.end: + %cond = phi ptr [ %load2, %cond.false ], [ %alpha_union, %entry ] 
+ call void @llvm.memcpy.p0.p0.i64(ptr poison, ptr %cond, i64 16, i1 false) + ret void +} diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll new file mode 100644 index 0000000000000..beb84362b7f92 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S < %s | FileCheck %s + +%struct.type = type { [256 x <2 x i64>] } +@g1 = external hidden addrspace(3) global %struct.type, align 16 + +; This test requires the PtrReplacer to replace users in an RPO traversal. +; Furthermore, %ptr.else need not to be replaced so it must be retained in +; %ptr.sink. +define <2 x i64> @func(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) { +; CHECK-LABEL: define <2 x i64> @func( +; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[CMP_0]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[VAL_THEN:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr +; CHECK-NEXT: br label %[[SINK:.*]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[PTR_ELSE:%.*]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16 +; CHECK-NEXT: br label %[[SINK]] +; CHECK: [[SINK]]: +; CHECK-NEXT: [[PTR_SINK:%.*]] = phi ptr [ [[PTR_ELSE]], %[[IF_ELSE]] ], [ [[VAL_THEN]], %[[IF_THEN]] ] +; CHECK-NEXT: [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_SINK]], align 16 +; CHECK-NEXT: ret <2 x i64> [[VAL_SINK]] +; +entry: + %coerce = alloca %struct.type, align 16, addrspace(5) + call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false) + br i1 %cmp.0, label %if.then, label %if.else + +if.then: 
; preds = %entry + %ptr.then = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0 + %val.then = addrspacecast ptr addrspace(5) %ptr.then to ptr + br label %sink + +if.else: ; preds = %entry + %ptr.else = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16 + %val.else = getelementptr inbounds nuw i8, ptr %ptr.else, i64 0 + br label %sink + +sink: + %ptr.sink = phi ptr [ %val.else, %if.else ], [ %val.then, %if.then ] + %val.sink = load <2 x i64>, ptr %ptr.sink, align 16 + ret <2 x i64> %val.sink +} + +define <2 x i64> @func_phi_loop(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) { +; CHECK-LABEL: define <2 x i64> @func_phi_loop( +; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[VAL_0:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR_PHI_R:%.*]] = phi ptr [ [[PTR_1:%.*]], %[[LOOP]] ], [ [[VAL_0]], %[[ENTRY]] ] +; CHECK-NEXT: [[PTR_1]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16 +; CHECK-NEXT: br i1 [[CMP_0]], label %[[LOOP]], label %[[SINK:.*]] +; CHECK: [[SINK]]: +; CHECK-NEXT: [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_PHI_R]], align 16 +; CHECK-NEXT: ret <2 x i64> [[VAL_SINK]] +; +entry: + %coerce = alloca %struct.type, align 16, addrspace(5) + call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false) + %ptr.0 = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0 + %val.0 = addrspacecast ptr addrspace(5) %ptr.0 to ptr + br label %loop + +loop: + %ptr.phi = phi ptr [ %val.1, %loop ], [ %val.0, %entry ] + %ptr.1 = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16 + %val.1 = getelementptr inbounds nuw i8, ptr %ptr.1, i64 0 + br i1 %cmp.0, label %loop, 
label %sink + +sink: + %val.sink = load <2 x i64>, ptr %ptr.phi, align 16 + ret <2 x i64> %val.sink +} + +; Crashed in IC PtrReplacer because an invalid select was generated with addrspace(4) and addrspace(5) +; operands. +define amdgpu_kernel void @select_addr4_addr5(ptr addrspace(4) byref([12 x i8]) align 16 %arg) { +; CHECK-LABEL: define amdgpu_kernel void @select_addr4_addr5( +; CHECK-SAME: ptr addrspace(4) byref([12 x i8]) align 16 [[ARG:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: ret void +; +bb: + %alloca = alloca i32, i32 0, align 8, addrspace(5) + %alloca1 = alloca [12 x i8], align 16, addrspace(5) + call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca1, ptr addrspace(4) %arg, i64 0, i1 false) + %select = select i1 false, ptr addrspace(5) %alloca1, ptr addrspace(5) %alloca + call void @llvm.memcpy.p0.p5.i64(ptr null, ptr addrspace(5) %select, i64 0, i1 false) + ret void +} + +; Same as above but with swapped operands on the select. +define amdgpu_kernel void @select_addr4_addr5_swapped(ptr addrspace(4) byref([12 x i8]) align 16 %arg) { +; CHECK-LABEL: define amdgpu_kernel void @select_addr4_addr5_swapped( +; CHECK-SAME: ptr addrspace(4) byref([12 x i8]) align 16 [[ARG:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: ret void +; +bb: + %alloca = alloca i32, i32 0, align 8, addrspace(5) + %alloca1 = alloca [12 x i8], align 16, addrspace(5) + call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca1, ptr addrspace(4) %arg, i64 0, i1 false) + %select = select i1 false, ptr addrspace(5) %alloca, ptr addrspace(5) %alloca1 + call void @llvm.memcpy.p0.p5.i64(ptr null, ptr addrspace(5) %select, i64 0, i1 false) + ret void +} + +declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(4) noalias readonly captures(none), i64, i1 immarg) #0 diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll new file 
mode 100644 index 0000000000000..1082c6ddb898b --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll @@ -0,0 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s + +@gv = constant i32 0 + +define float @cvt_var(i32 %a) { +; CHECK-LABEL: define float @cvt_var( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 [[A]]) +; CHECK-NEXT: ret float [[RET]] +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 %a) + ret float %ret +} + +define float @cvt_imm_0() { +; CHECK-LABEL: define float @cvt_imm_0() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 0) + ret float %ret +} + +define float @cvt_imm_1() { +; CHECK-LABEL: define float @cvt_imm_1() { +; CHECK-NEXT: ret float 6.250000e-02 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 1) + ret float %ret +} + +define float @cvt_imm_2() { +; CHECK-LABEL: define float @cvt_imm_2() { +; CHECK-NEXT: ret float 1.250000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 2) + ret float %ret +} + +define float @cvt_imm_3() { +; CHECK-LABEL: define float @cvt_imm_3() { +; CHECK-NEXT: ret float 1.875000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 3) + ret float %ret +} + +define float @cvt_imm_4() { +; CHECK-LABEL: define float @cvt_imm_4() { +; CHECK-NEXT: ret float 2.500000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 4) + ret float %ret +} + +define float @cvt_imm_5() { +; CHECK-LABEL: define float @cvt_imm_5() { +; CHECK-NEXT: ret float 3.125000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 5) + ret float %ret +} + +define float @cvt_imm_6() { +; CHECK-LABEL: define float @cvt_imm_6() { +; CHECK-NEXT: ret float 3.750000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 6) + ret float %ret +} + 
+define float @cvt_imm_7() { +; CHECK-LABEL: define float @cvt_imm_7() { +; CHECK-NEXT: ret float 4.375000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 7) + ret float %ret +} + +define float @cvt_imm_8() { +; CHECK-LABEL: define float @cvt_imm_8() { +; CHECK-NEXT: ret float -5.000000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 8) + ret float %ret +} + +define float @cvt_imm_9() { +; CHECK-LABEL: define float @cvt_imm_9() { +; CHECK-NEXT: ret float -4.375000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 9) + ret float %ret +} + +define float @cvt_imm_10() { +; CHECK-LABEL: define float @cvt_imm_10() { +; CHECK-NEXT: ret float -3.750000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 10) + ret float %ret +} + +define float @cvt_imm_11() { +; CHECK-LABEL: define float @cvt_imm_11() { +; CHECK-NEXT: ret float -3.125000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 11) + ret float %ret +} + +define float @cvt_imm_12() { +; CHECK-LABEL: define float @cvt_imm_12() { +; CHECK-NEXT: ret float -2.500000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 12) + ret float %ret +} + +define float @cvt_imm_13() { +; CHECK-LABEL: define float @cvt_imm_13() { +; CHECK-NEXT: ret float -1.875000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 13) + ret float %ret +} + +define float @cvt_imm_14() { +; CHECK-LABEL: define float @cvt_imm_14() { +; CHECK-NEXT: ret float -1.250000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 14) + ret float %ret +} + +define float @cvt_imm_15() { +; CHECK-LABEL: define float @cvt_imm_15() { +; CHECK-NEXT: ret float -6.250000e-02 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 15) + ret float %ret +} + +define float @cvt_imm_underflow() { +; CHECK-LABEL: define float @cvt_imm_underflow() { +; CHECK-NEXT: ret float -6.250000e-02 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 -1) + ret float %ret +} + +define float @cvt_imm_overflow() { +; 
CHECK-LABEL: define float @cvt_imm_overflow() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 16) + ret float %ret +} + +define float @cvt_poison() { +; CHECK-LABEL: define float @cvt_poison() { +; CHECK-NEXT: ret float poison +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 poison) + ret float %ret +} + +define float @cvt_undef() { +; CHECK-LABEL: define float @cvt_undef() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 undef) + ret float %ret +} + +define float @cvt_constexpr() { +; CHECK-LABEL: define float @cvt_constexpr() { +; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 ptrtoint (ptr @gv to i32)) +; CHECK-NEXT: ret float [[RET]] +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 ptrtoint (ptr @gv to i32)) + ret float %ret +} diff --git a/llvm/test/Transforms/InstCombine/or-packed-int-vecs.ll b/llvm/test/Transforms/InstCombine/or-packed-int-vecs.ll new file mode 100644 index 0000000000000..9391fb5ddae97 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/or-packed-int-vecs.ll @@ -0,0 +1,926 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt %s -passes=instcombine -data-layout="E" -S | FileCheck %s --check-prefixes=CHECK,CHECK-BE +; RUN: opt %s -passes=instcombine -data-layout="e" -S | FileCheck %s --check-prefixes=CHECK,CHECK-LE + +define i32 @bitcast.v2i.le(<4 x i8> %v) { +; CHECK-BE-LABEL: define i32 @bitcast.v2i.le( +; CHECK-BE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <4 x i8> [[V]], i64 0 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <4 x i8> [[V]], i64 1 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <4 x 
i8> [[V]], i64 2 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-BE-NEXT: [[X_2:%.*]] = or disjoint i32 [[X_1]], [[S_2]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <4 x i8> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_2]], [[S_3]] +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @bitcast.v2i.le( +; CHECK-LE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3:%.*]] = bitcast <4 x i8> [[V]] to i32 +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <4 x i8> %v, i64 0 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <4 x i8> %v, i64 1 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.2 = extractelement <4 x i8> %v, i64 2 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + %x.2 = or i32 %x.1, %s.2 + + %v.3 = extractelement <4 x i8> %v, i64 3 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.2, %s.3 + + ret i32 %x.3 +} + +define i32 @bitcast.v2i.be(<4 x i8> %v) { +; CHECK-BE-LABEL: define i32 @bitcast.v2i.be( +; CHECK-BE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3:%.*]] = bitcast <4 x i8> [[V]] to i32 +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @bitcast.v2i.be( +; CHECK-LE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <4 x i8> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <4 x i8> [[V]], i64 2 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <4 x i8> [[V]], i64 1 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 
[[Z_2]], 16 +; CHECK-LE-NEXT: [[X_2:%.*]] = or disjoint i32 [[X_1]], [[S_2]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <4 x i8> [[V]], i64 0 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_2]], [[S_3]] +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <4 x i8> %v, i64 3 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <4 x i8> %v, i64 2 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.2 = extractelement <4 x i8> %v, i64 1 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + %x.2 = or i32 %x.1, %s.2 + + %v.3 = extractelement <4 x i8> %v, i64 0 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.2, %s.3 + + ret i32 %x.3 +} + +define i64 @bitcast.v2i.le.i16(<4 x i16> %v) { +; CHECK-BE-LABEL: define i64 @bitcast.v2i.le.i16( +; CHECK-BE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <4 x i16> [[V]], i64 0 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <4 x i16> [[V]], i64 1 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <4 x i16> [[V]], i64 2 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-BE-NEXT: [[X_2:%.*]] = or disjoint i64 [[X_1]], [[S_2]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <4 x i16> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_2]], [[S_3]] +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @bitcast.v2i.le.i16( +; CHECK-LE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: 
[[X_3:%.*]] = bitcast <4 x i16> [[V]] to i64 +; CHECK-LE-NEXT: ret i64 [[X_3]] +; + %v.0 = extractelement <4 x i16> %v, i64 0 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <4 x i16> %v, i64 1 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <4 x i16> %v, i64 2 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + %x.2 = or i64 %x.1, %s.2 + + %v.3 = extractelement <4 x i16> %v, i64 3 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.2, %s.3 + + ret i64 %x.3 +} + +define i64 @bitcast.v2i.be.i16(<4 x i16> %v) { +; CHECK-BE-LABEL: define i64 @bitcast.v2i.be.i16( +; CHECK-BE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3:%.*]] = bitcast <4 x i16> [[V]] to i64 +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @bitcast.v2i.be.i16( +; CHECK-LE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <4 x i16> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <4 x i16> [[V]], i64 2 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <4 x i16> [[V]], i64 1 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-LE-NEXT: [[X_2:%.*]] = or disjoint i64 [[X_1]], [[S_2]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <4 x i16> [[V]], i64 0 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_2]], [[S_3]] +; CHECK-LE-NEXT: ret i64 [[X_3]] +; + %v.0 = extractelement <4 x i16> %v, i64 3 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <4 x i16> %v, i64 2 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or 
i64 %z.0, %s.1 + + %v.2 = extractelement <4 x i16> %v, i64 1 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + %x.2 = or i64 %x.1, %s.2 + + %v.3 = extractelement <4 x i16> %v, i64 0 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.2, %s.3 + + ret i64 %x.3 +} + +define i32 @bitcast.v2i.le.tree(<4 x i8> %v) { +; CHECK-BE-LABEL: define i32 @bitcast.v2i.le.tree( +; CHECK-BE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <4 x i8> [[V]], i64 0 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <4 x i8> [[V]], i64 1 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <4 x i8> [[V]], i64 2 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <4 x i8> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i32 [[S_2]], [[S_3]] +; CHECK-BE-NEXT: [[X:%.*]] = or disjoint i32 [[X_1]], [[X_3]] +; CHECK-BE-NEXT: ret i32 [[X]] +; +; CHECK-LE-LABEL: define i32 @bitcast.v2i.le.tree( +; CHECK-LE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X:%.*]] = bitcast <4 x i8> [[V]] to i32 +; CHECK-LE-NEXT: ret i32 [[X]] +; + %v.0 = extractelement <4 x i8> %v, i64 0 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <4 x i8> %v, i64 1 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.2 = extractelement <4 x i8> %v, i64 2 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + + %v.3 = extractelement <4 x i8> %v, i64 3 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %s.2, %s.3 + + %x = or i32 %x.1, %x.3 + + ret i32 %x +} + 
+define i32 @bitcast.v2i.be.tree(<4 x i8> %v) { +; CHECK-BE-LABEL: define i32 @bitcast.v2i.be.tree( +; CHECK-BE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X:%.*]] = bitcast <4 x i8> [[V]] to i32 +; CHECK-BE-NEXT: ret i32 [[X]] +; +; CHECK-LE-LABEL: define i32 @bitcast.v2i.be.tree( +; CHECK-LE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <4 x i8> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <4 x i8> [[V]], i64 2 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <4 x i8> [[V]], i64 1 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <4 x i8> [[V]], i64 0 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i32 [[S_2]], [[S_3]] +; CHECK-LE-NEXT: [[X:%.*]] = or disjoint i32 [[X_1]], [[X_3]] +; CHECK-LE-NEXT: ret i32 [[X]] +; + %v.0 = extractelement <4 x i8> %v, i64 3 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <4 x i8> %v, i64 2 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.2 = extractelement <4 x i8> %v, i64 1 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + + %v.3 = extractelement <4 x i8> %v, i64 0 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %s.2, %s.3 + + %x = or i32 %x.1, %x.3 + + ret i32 %x +} + +define i64 @bitcast.v2i.le.tree.i16(<4 x i16> %v) { +; CHECK-BE-LABEL: define i64 @bitcast.v2i.le.tree.i16( +; CHECK-BE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <4 x i16> [[V]], i64 0 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-BE-NEXT: 
[[V_1:%.*]] = extractelement <4 x i16> [[V]], i64 1 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <4 x i16> [[V]], i64 2 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <4 x i16> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i64 [[S_2]], [[S_3]] +; CHECK-BE-NEXT: [[X:%.*]] = or disjoint i64 [[X_1]], [[X_3]] +; CHECK-BE-NEXT: ret i64 [[X]] +; +; CHECK-LE-LABEL: define i64 @bitcast.v2i.le.tree.i16( +; CHECK-LE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X:%.*]] = bitcast <4 x i16> [[V]] to i64 +; CHECK-LE-NEXT: ret i64 [[X]] +; + %v.0 = extractelement <4 x i16> %v, i64 0 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <4 x i16> %v, i64 1 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <4 x i16> %v, i64 2 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + + %v.3 = extractelement <4 x i16> %v, i64 3 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %s.2, %s.3 + + %x = or i64 %x.1, %x.3 + + ret i64 %x +} + +define i64 @bitcast.v2i.be.tree.i16(<4 x i16> %v) { +; CHECK-BE-LABEL: define i64 @bitcast.v2i.be.tree.i16( +; CHECK-BE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X:%.*]] = bitcast <4 x i16> [[V]] to i64 +; CHECK-BE-NEXT: ret i64 [[X]] +; +; CHECK-LE-LABEL: define i64 @bitcast.v2i.be.tree.i16( +; CHECK-LE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <4 x i16> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <4 x i16> [[V]], i64 2 +; CHECK-LE-NEXT: 
[[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <4 x i16> [[V]], i64 1 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <4 x i16> [[V]], i64 0 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i64 [[S_2]], [[S_3]] +; CHECK-LE-NEXT: [[X:%.*]] = or disjoint i64 [[X_1]], [[X_3]] +; CHECK-LE-NEXT: ret i64 [[X]] +; + %v.0 = extractelement <4 x i16> %v, i64 3 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <4 x i16> %v, i64 2 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <4 x i16> %v, i64 1 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + + %v.3 = extractelement <4 x i16> %v, i64 0 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %s.2, %s.3 + + %x = or i64 %x.1, %x.3 + + ret i64 %x +} + +define i32 @extract.le.i32(<8 x i8> %v) { +; CHECK-BE-LABEL: define i32 @extract.le.i32( +; CHECK-BE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <8 x i8> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <8 x i8> [[V]], i64 4 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <8 x i8> [[V]], i64 5 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-BE-NEXT: [[X_2:%.*]] = or disjoint i32 [[X_1]], [[S_2]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <8 x i8> [[V]], i64 6 +; 
CHECK-BE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_2]], [[S_3]] +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @extract.le.i32( +; CHECK-LE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3_V_EXTRACT:%.*]] = shufflevector <8 x i8> [[V]], <8 x i8> poison, <8 x i32> +; CHECK-LE-NEXT: [[X_3_V_BC:%.*]] = bitcast <8 x i8> [[X_3_V_EXTRACT]] to <2 x i32> +; CHECK-LE-NEXT: [[X_3_V_EXTRACT1:%.*]] = extractelement <2 x i32> [[X_3_V_BC]], i64 0 +; CHECK-LE-NEXT: ret i32 [[X_3_V_EXTRACT1]] +; + %v.0 = extractelement <8 x i8> %v, i64 3 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <8 x i8> %v, i64 4 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.2 = extractelement <8 x i8> %v, i64 5 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + %x.2 = or i32 %x.1, %s.2 + + %v.3 = extractelement <8 x i8> %v, i64 6 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.2, %s.3 + + ret i32 %x.3 +} + +define i32 @extract.be.i32(<8 x i8> %v) { +; CHECK-BE-LABEL: define i32 @extract.be.i32( +; CHECK-BE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3_V_EXTRACT:%.*]] = shufflevector <8 x i8> [[V]], <8 x i8> poison, <8 x i32> +; CHECK-BE-NEXT: [[X_3_V_BC:%.*]] = bitcast <8 x i8> [[X_3_V_EXTRACT]] to <2 x i32> +; CHECK-BE-NEXT: [[X_3_V_EXTRACT1:%.*]] = extractelement <2 x i32> [[X_3_V_BC]], i64 0 +; CHECK-BE-NEXT: ret i32 [[X_3_V_EXTRACT1]] +; +; CHECK-LE-LABEL: define i32 @extract.be.i32( +; CHECK-LE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <8 x i8> [[V]], i64 6 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <8 x i8> [[V]], i64 5 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] 
+; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <8 x i8> [[V]], i64 4 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-LE-NEXT: [[X_2:%.*]] = or disjoint i32 [[X_1]], [[S_2]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <8 x i8> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_2]], [[S_3]] +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <8 x i8> %v, i64 6 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <8 x i8> %v, i64 5 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.2 = extractelement <8 x i8> %v, i64 4 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + %x.2 = or i32 %x.1, %s.2 + + %v.3 = extractelement <8 x i8> %v, i64 3 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.2, %s.3 + + ret i32 %x.3 +} + +define i64 @extract.le.i64(<8 x i16> %v) { +; CHECK-BE-LABEL: define i64 @extract.le.i64( +; CHECK-BE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <8 x i16> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <8 x i16> [[V]], i64 4 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <8 x i16> [[V]], i64 5 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-BE-NEXT: [[X_2:%.*]] = or disjoint i64 [[X_1]], [[S_2]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <8 x i16> [[V]], i64 6 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i64 
[[X_2]], [[S_3]] +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @extract.le.i64( +; CHECK-LE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3_V_EXTRACT:%.*]] = shufflevector <8 x i16> [[V]], <8 x i16> poison, <8 x i32> +; CHECK-LE-NEXT: [[X_3_V_BC:%.*]] = bitcast <8 x i16> [[X_3_V_EXTRACT]] to <2 x i64> +; CHECK-LE-NEXT: [[X_3_V_EXTRACT1:%.*]] = extractelement <2 x i64> [[X_3_V_BC]], i64 0 +; CHECK-LE-NEXT: ret i64 [[X_3_V_EXTRACT1]] +; + %v.0 = extractelement <8 x i16> %v, i64 3 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <8 x i16> %v, i64 4 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <8 x i16> %v, i64 5 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + %x.2 = or i64 %x.1, %s.2 + + %v.3 = extractelement <8 x i16> %v, i64 6 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.2, %s.3 + + ret i64 %x.3 +} + +define i64 @extract.be.i64(<8 x i16> %v) { +; CHECK-BE-LABEL: define i64 @extract.be.i64( +; CHECK-BE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3_V_EXTRACT:%.*]] = shufflevector <8 x i16> [[V]], <8 x i16> poison, <8 x i32> +; CHECK-BE-NEXT: [[X_3_V_BC:%.*]] = bitcast <8 x i16> [[X_3_V_EXTRACT]] to <2 x i64> +; CHECK-BE-NEXT: [[X_3_V_EXTRACT1:%.*]] = extractelement <2 x i64> [[X_3_V_BC]], i64 0 +; CHECK-BE-NEXT: ret i64 [[X_3_V_EXTRACT1]] +; +; CHECK-LE-LABEL: define i64 @extract.be.i64( +; CHECK-LE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <8 x i16> [[V]], i64 6 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <8 x i16> [[V]], i64 5 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <8 x i16> [[V]], i64 4 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; 
CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-LE-NEXT: [[X_2:%.*]] = or disjoint i64 [[X_1]], [[S_2]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <8 x i16> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_2]], [[S_3]] +; CHECK-LE-NEXT: ret i64 [[X_3]] +; + %v.0 = extractelement <8 x i16> %v, i64 6 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <8 x i16> %v, i64 5 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <8 x i16> %v, i64 4 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + %x.2 = or i64 %x.1, %s.2 + + %v.3 = extractelement <8 x i16> %v, i64 3 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.2, %s.3 + + ret i64 %x.3 +} + +define i32 @partial.le(<4 x i8> %v) { +; CHECK-BE-LABEL: define i32 @partial.le( +; CHECK-BE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <4 x i8> [[V]], i64 0 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <4 x i8> [[V]], i64 1 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <4 x i8> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_1]], [[S_3]] +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @partial.le( +; CHECK-LE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3_V1:%.*]] = insertelement <4 x i8> [[V]], i8 0, i64 2 +; CHECK-LE-NEXT: [[X_3:%.*]] = bitcast <4 x i8> [[X_3_V1]] to i32 +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <4 x i8> %v, i64 0 + %z.0 = zext i8 %v.0 to 
i32 + + %v.1 = extractelement <4 x i8> %v, i64 1 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.3 = extractelement <4 x i8> %v, i64 3 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.1, %s.3 + + ret i32 %x.3 +} + +define i32 @partial.be(<4 x i8> %v) { +; CHECK-BE-LABEL: define i32 @partial.be( +; CHECK-BE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3_V1:%.*]] = insertelement <4 x i8> [[V]], i8 0, i64 2 +; CHECK-BE-NEXT: [[X_3:%.*]] = bitcast <4 x i8> [[X_3_V1]] to i32 +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @partial.be( +; CHECK-LE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <4 x i8> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <4 x i8> [[V]], i64 1 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 16 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <4 x i8> [[V]], i64 0 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_1]], [[S_3]] +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <4 x i8> %v, i64 3 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <4 x i8> %v, i64 1 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 16 + %x.1 = or i32 %z.0, %s.1 + + %v.3 = extractelement <4 x i8> %v, i64 0 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.1, %s.3 + + ret i32 %x.3 +} + + +define i64 @partial.le.i16(<4 x i16> %v) { +; CHECK-BE-LABEL: define i64 @partial.le.i16( +; CHECK-BE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <4 x i16> [[V]], i64 0 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <4 x i16> [[V]], i64 1 
+; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <4 x i16> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_1]], [[S_3]] +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @partial.le.i16( +; CHECK-LE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3_V1:%.*]] = insertelement <4 x i16> [[V]], i16 0, i64 2 +; CHECK-LE-NEXT: [[X_3:%.*]] = bitcast <4 x i16> [[X_3_V1]] to i64 +; CHECK-LE-NEXT: ret i64 [[X_3]] +; + %v.0 = extractelement <4 x i16> %v, i64 0 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <4 x i16> %v, i64 1 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.3 = extractelement <4 x i16> %v, i64 3 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.1, %s.3 + + ret i64 %x.3 +} + +define i64 @partial.be.i16(<4 x i16> %v) { +; CHECK-BE-LABEL: define i64 @partial.be.i16( +; CHECK-BE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3_V1:%.*]] = insertelement <4 x i16> [[V]], i16 0, i64 2 +; CHECK-BE-NEXT: [[X_3:%.*]] = bitcast <4 x i16> [[X_3_V1]] to i64 +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @partial.be.i16( +; CHECK-LE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <4 x i16> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <4 x i16> [[V]], i64 1 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 32 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <4 x i16> [[V]], i64 0 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 
+; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_1]], [[S_3]] +; CHECK-LE-NEXT: ret i64 [[X_3]] +; + %v.0 = extractelement <4 x i16> %v, i64 3 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <4 x i16> %v, i64 1 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 32 + %x.1 = or i64 %z.0, %s.1 + + %v.3 = extractelement <4 x i16> %v, i64 0 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.1, %s.3 + + ret i64 %x.3 +} + +define i32 @partial.extract.le.i32(<8 x i8> %v) { +; CHECK-BE-LABEL: define i32 @partial.extract.le.i32( +; CHECK-BE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <8 x i8> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <8 x i8> [[V]], i64 4 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <8 x i8> [[V]], i64 6 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_1]], [[S_3]] +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @partial.extract.le.i32( +; CHECK-LE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3_V:%.*]] = shufflevector <8 x i8> [[V]], <8 x i8> , <4 x i32> +; CHECK-LE-NEXT: [[X_3:%.*]] = bitcast <4 x i8> [[X_3_V]] to i32 +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <8 x i8> %v, i64 3 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <8 x i8> %v, i64 4 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.3 = extractelement <8 x i8> %v, i64 6 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.1, %s.3 + + ret i32 %x.3 +} + +define i32 @partial.extract.be.i32(<8 x i8> %v) { +; 
CHECK-BE-LABEL: define i32 @partial.extract.be.i32( +; CHECK-BE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3_V:%.*]] = shufflevector <8 x i8> [[V]], <8 x i8> , <4 x i32> +; CHECK-BE-NEXT: [[X_3:%.*]] = bitcast <4 x i8> [[X_3_V]] to i32 +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @partial.extract.be.i32( +; CHECK-LE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <8 x i8> [[V]], i64 6 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <8 x i8> [[V]], i64 4 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-LE-NEXT: [[X_2:%.*]] = or disjoint i32 [[S_2]], [[Z_0]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <8 x i8> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_2]], [[S_3]] +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <8 x i8> %v, i64 6 + %z.0 = zext i8 %v.0 to i32 + + %v.2 = extractelement <8 x i8> %v, i64 4 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + %x.2 = or i32 %z.0, %s.2 + + %v.3 = extractelement <8 x i8> %v, i64 3 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.2, %s.3 + + ret i32 %x.3 +} + +define i64 @partial.extract.le.i64(<8 x i16> %v) { +; CHECK-BE-LABEL: define i64 @partial.extract.le.i64( +; CHECK-BE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <8 x i16> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <8 x i16> [[V]], i64 4 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <8 x i16> [[V]], i64 5 +; CHECK-BE-NEXT: 
[[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-BE-NEXT: [[X_2:%.*]] = or disjoint i64 [[X_1]], [[S_2]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <8 x i16> [[V]], i64 6 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_2]], [[S_3]] +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @partial.extract.le.i64( +; CHECK-LE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3_V_EXTRACT:%.*]] = shufflevector <8 x i16> [[V]], <8 x i16> poison, <8 x i32> +; CHECK-LE-NEXT: [[X_3_V_BC:%.*]] = bitcast <8 x i16> [[X_3_V_EXTRACT]] to <2 x i64> +; CHECK-LE-NEXT: [[X_3_V_EXTRACT1:%.*]] = extractelement <2 x i64> [[X_3_V_BC]], i64 0 +; CHECK-LE-NEXT: ret i64 [[X_3_V_EXTRACT1]] +; + %v.0 = extractelement <8 x i16> %v, i64 3 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <8 x i16> %v, i64 4 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <8 x i16> %v, i64 5 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + %x.2 = or i64 %x.1, %s.2 + + %v.3 = extractelement <8 x i16> %v, i64 6 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.2, %s.3 + + ret i64 %x.3 +} + +define i64 @partial.extract.be.i64(<8 x i16> %v) { +; CHECK-BE-LABEL: define i64 @partial.extract.be.i64( +; CHECK-BE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3_V:%.*]] = shufflevector <8 x i16> [[V]], <8 x i16> , <4 x i32> +; CHECK-BE-NEXT: [[X_3:%.*]] = bitcast <4 x i16> [[X_3_V]] to i64 +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @partial.extract.be.i64( +; CHECK-LE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <8 x i16> [[V]], i64 6 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <8 x i16> [[V]], i64 4 +; CHECK-LE-NEXT: 
[[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-LE-NEXT: [[X_2:%.*]] = or disjoint i64 [[S_2]], [[Z_0]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <8 x i16> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_2]], [[S_3]] +; CHECK-LE-NEXT: ret i64 [[X_3]] +; + %v.0 = extractelement <8 x i16> %v, i64 6 + %z.0 = zext i16 %v.0 to i64 + + %v.2 = extractelement <8 x i16> %v, i64 4 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + %x.2 = or i64 %z.0, %s.2 + + %v.3 = extractelement <8 x i16> %v, i64 3 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.2, %s.3 + + ret i64 %x.3 +} + +define <2 x i16> @shufflecast.v2v(<4 x i8> %v) { +; CHECK-LABEL: define <2 x i16> @shufflecast.v2v( +; CHECK-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-NEXT: [[W_3:%.*]] = bitcast <4 x i8> [[V]] to <2 x i16> +; CHECK-NEXT: ret <2 x i16> [[W_3]] +; + %v.0 = shufflevector <4 x i8> %v, <4 x i8> zeroinitializer, <4 x i32> + %c.0 = bitcast <4 x i8> %v.0 to <2 x i16> + + %v.1 = shufflevector <4 x i8> %v, <4 x i8> zeroinitializer, <4 x i32> + %c.1 = bitcast <4 x i8> %v.1 to <2 x i16> + %w.1 = or <2 x i16> %c.0, %c.1 + + %v.2 = shufflevector <4 x i8> %v, <4 x i8> zeroinitializer, <4 x i32> + %c.2 = bitcast <4 x i8> %v.2 to <2 x i16> + %w.2 = or <2 x i16> %w.1, %c.2 + + %v.3 = shufflevector <4 x i8> %v, <4 x i8> zeroinitializer, <4 x i32> + %c.3 = bitcast <4 x i8> %v.3 to <2 x i16> + %w.3 = or <2 x i16> %w.2, %c.3 + + ret <2 x i16> %w.3 +} + +define <2 x i32> @shufflecast.v2v.i16(<4 x i16> %v) { +; CHECK-LABEL: define <2 x i32> @shufflecast.v2v.i16( +; CHECK-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-NEXT: [[W_3:%.*]] = bitcast <4 x i16> [[V]] to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[W_3]] +; + %v.0 = shufflevector <4 x i16> %v, <4 x i16> zeroinitializer, <4 x i32> + %c.0 = bitcast <4 x i16> 
%v.0 to <2 x i32> + + %v.1 = shufflevector <4 x i16> %v, <4 x i16> zeroinitializer, <4 x i32> + %c.1 = bitcast <4 x i16> %v.1 to <2 x i32> + %w.1 = or <2 x i32> %c.0, %c.1 + + %v.2 = shufflevector <4 x i16> %v, <4 x i16> zeroinitializer, <4 x i32> + %c.2 = bitcast <4 x i16> %v.2 to <2 x i32> + %w.2 = or <2 x i32> %w.1, %c.2 + + %v.3 = shufflevector <4 x i16> %v, <4 x i16> zeroinitializer, <4 x i32> + %c.3 = bitcast <4 x i16> %v.3 to <2 x i32> + %w.3 = or <2 x i32> %w.2, %c.3 + + ret <2 x i32> %w.3 +} + +define i32 @bitcast.v2i.half(<2 x half> %v) { +; CHECK-LABEL: define i32 @bitcast.v2i.half( +; CHECK-SAME: <2 x half> [[V:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = bitcast <2 x half> [[V]] to i32 +; CHECK-NEXT: ret i32 [[X]] +; + %v.0 = insertelement <2 x half> %v, half 0.0, i64 1 + %x.0 = bitcast <2 x half> %v.0 to i32 + + %v.1 = insertelement <2 x half> %v, half 0.0, i64 0 + %x.1 = bitcast <2 x half> %v.1 to i32 + + %x = or i32 %x.0, %x.1 + ret i32 %x +} diff --git a/llvm/test/Transforms/LoopUnroll/loop-branch-folding.ll b/llvm/test/Transforms/LoopUnroll/loop-branch-folding.ll new file mode 100644 index 0000000000000..57d7320bada5c --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/loop-branch-folding.ll @@ -0,0 +1,937 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 + ; RUN: opt < %s -S -passes=simplifycfg | FileCheck %s --check-prefixes=CHECK-CFG + ; RUN: opt < %s -S -passes=simplifycfg,loop-unroll --unroll-max-upperbound=17 | FileCheck %s --check-prefixes=CHECK-UNROLL + + ; This test is designed to check: + ; We can still unroll a loop with 'pragma unroll' if the loop count (trip count) was destroyed by a previous optimization. + ; For example, in the following test, the loop condition "Dim < 16" was 'merged' with "Dim == Dims" in folding branches + ; at simplifycfg. But if the customer marks the loop with "#pragma unroll", we can still successfully unroll it under + ; unroll-max-upperbound. 
+ ; + ; __device__ void func(int Idx, int *Arr[], int Dims, int *Out) { + ; #pragma unroll + ; for (int Dim = 0; Dim < 16; ++Dim) { + ; if (Dim == Dims) { + ; break; + ; } + ; int divmod = Arr[Dim][Idx]; + ; Idx = divmod + 1; + ; + ; for (int arg = 0; arg < 4; arg++) { + ; Out[arg] += Arr[Dim][arg]; + ; bar(); + ; } + ; } + ; } + + define void @func(i32 noundef %Idx, ptr noundef %Arr, i32 noundef %Dims, ptr noundef %Out) { + ; CHECK-CFG-LABEL: define void @func( + ; CHECK-CFG-SAME: i32 noundef [[IDX:%.*]], ptr noundef [[ARR:%.*]], i32 noundef [[DIMS:%.*]], ptr noundef [[OUT:%.*]]) { + ; CHECK-CFG-NEXT: entry: + ; CHECK-CFG-NEXT: br label [[FOR_COND:%.*]] + ; CHECK-CFG: for.cond: + ; CHECK-CFG-NEXT: [[DIM_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC16:%.*]], [[FOR_COND_CLEANUP6:%.*]] ] + ; CHECK-CFG-NEXT: [[IDX_ADDR_0:%.*]] = phi i32 [ [[IDX]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_COND_CLEANUP6]] ] + ; CHECK-CFG-NEXT: [[CMP:%.*]] = icmp sge i32 [[DIM_0]], 16 + ; CHECK-CFG-NEXT: [[CMP1:%.*]] = icmp eq i32 [[DIM_0]], [[DIMS]] + ; CHECK-CFG-NEXT: [[OR_COND:%.*]] = or i1 [[CMP]], [[CMP1]] + ; CHECK-CFG-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] + ; CHECK-CFG: if.end: + ; CHECK-CFG-NEXT: [[IDXPROM:%.*]] = sext i32 [[DIM_0]] to i64 + ; CHECK-CFG-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 [[IDXPROM]] + ; CHECK-CFG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 + ; CHECK-CFG-NEXT: [[IDXPROM2:%.*]] = sext i32 [[IDX_ADDR_0]] to i64 + ; CHECK-CFG-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[IDXPROM2]] + ; CHECK-CFG-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 + ; CHECK-CFG-NEXT: [[ADD]] = add nsw i32 [[TMP1]], 1 + ; CHECK-CFG-NEXT: br label [[FOR_COND4:%.*]] + ; CHECK-CFG: for.cond4: + ; CHECK-CFG-NEXT: [[ARG_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[INC:%.*]], [[FOR_BODY7:%.*]] ] + ; CHECK-CFG-NEXT: [[CMP5:%.*]] = icmp slt i32 [[ARG_0]], 4 + ; CHECK-CFG-NEXT: br i1 
[[CMP5]], label [[FOR_BODY7]], label [[FOR_COND_CLEANUP6]] + ; CHECK-CFG: for.cond.cleanup6: + ; CHECK-CFG-NEXT: [[INC16]] = add nsw i32 [[DIM_0]], 1 + ; CHECK-CFG-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]] + ; CHECK-CFG: for.body7: + ; CHECK-CFG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 + ; CHECK-CFG-NEXT: [[IDXPROM10:%.*]] = sext i32 [[ARG_0]] to i64 + ; CHECK-CFG-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[IDXPROM10]] + ; CHECK-CFG-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4 + ; CHECK-CFG-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IDXPROM10]] + ; CHECK-CFG-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 + ; CHECK-CFG-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP4]], [[TMP3]] + ; CHECK-CFG-NEXT: store i32 [[ADD14]], ptr [[ARRAYIDX13]], align 4 + ; CHECK-CFG-NEXT: call void @_Z3barv() + ; CHECK-CFG-NEXT: [[INC]] = add nsw i32 [[ARG_0]], 1 + ; CHECK-CFG-NEXT: br label [[FOR_COND4]], !llvm.loop [[LOOP3:![0-9]+]] + ; CHECK-CFG: cleanup: + ; CHECK-CFG-NEXT: ret void + ; + ; CHECK-UNROLL-LABEL: define void @func( + ; CHECK-UNROLL-SAME: i32 noundef [[IDX:%.*]], ptr noundef [[ARR:%.*]], i32 noundef [[DIMS:%.*]], ptr noundef [[OUT:%.*]]) { + ; CHECK-UNROLL-NEXT: entry: + ; CHECK-UNROLL-NEXT: br label [[FOR_COND:%.*]] + ; CHECK-UNROLL: for.cond: + ; CHECK-UNROLL-NEXT: [[CMP1:%.*]] = icmp eq i32 0, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] + ; CHECK-UNROLL: if.end: + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4:%.*]] + ; CHECK-UNROLL: for.cond4: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6: + ; CHECK-UNROLL-NEXT: [[CMP1_1:%.*]] = icmp eq i32 1, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_1]], label [[CLEANUP]], label [[IF_END_1:%.*]] + ; CHECK-UNROLL: if.end.1: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 1 + ; CHECK-UNROLL-NEXT: br 
label [[FOR_COND4_1:%.*]] + ; CHECK-UNROLL: for.cond4.1: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_12:%.*]] + ; CHECK-UNROLL: for.body7.12: + ; CHECK-UNROLL-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_1]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP2:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_11:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_11]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_1:%.*]] + ; CHECK-UNROLL: for.body7.1.1: + ; CHECK-UNROLL-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX_1]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX11_1_1]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_1:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX13_1_1]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_1:%.*]] = add nsw i32 [[TMP5]], [[TMP4]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_1]], ptr [[ARRAYIDX13_1_1]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_1:%.*]] + ; CHECK-UNROLL: for.body7.2.1: + ; CHECK-UNROLL-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX_1]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_1:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX11_2_1]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_1:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX13_2_1]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_1:%.*]] = add nsw i32 [[TMP8]], [[TMP7]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_1]], ptr [[ARRAYIDX13_2_1]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label 
[[FOR_BODY7_3_1:%.*]] + ; CHECK-UNROLL: for.body7.3.1: + ; CHECK-UNROLL-NEXT: [[TMP9:%.*]] = load ptr, ptr [[ARRAYIDX_1]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX11_3_1]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_1:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX13_3_1]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_1:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_1]], ptr [[ARRAYIDX13_3_1]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4:%.*]], label [[FOR_COND_CLEANUP6_1:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.1: + ; CHECK-UNROLL-NEXT: [[CMP1_2:%.*]] = icmp eq i32 2, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_2]], label [[CLEANUP]], label [[IF_END_2:%.*]] + ; CHECK-UNROLL: if.end.2: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 2 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_2:%.*]] + ; CHECK-UNROLL: for.cond4.2: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_24:%.*]] + ; CHECK-UNROLL: for.body7.24: + ; CHECK-UNROLL-NEXT: [[TMP12:%.*]] = load ptr, ptr [[ARRAYIDX_2]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP14:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_23:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_23]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_2:%.*]] + ; CHECK-UNROLL: for.body7.1.2: + ; CHECK-UNROLL-NEXT: [[TMP15:%.*]] = load ptr, ptr [[ARRAYIDX_2]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_2:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX11_1_2]], align 
4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_2:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX13_1_2]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_2:%.*]] = add nsw i32 [[TMP17]], [[TMP16]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_2]], ptr [[ARRAYIDX13_1_2]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_2:%.*]] + ; CHECK-UNROLL: for.body7.2.2: + ; CHECK-UNROLL-NEXT: [[TMP18:%.*]] = load ptr, ptr [[ARRAYIDX_2]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_2:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX11_2_2]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_2:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX13_2_2]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_2:%.*]] = add nsw i32 [[TMP20]], [[TMP19]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_2]], ptr [[ARRAYIDX13_2_2]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_2:%.*]] + ; CHECK-UNROLL: for.body7.3.2: + ; CHECK-UNROLL-NEXT: [[TMP21:%.*]] = load ptr, ptr [[ARRAYIDX_2]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_2:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX11_3_2]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_2:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX13_3_2]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_2:%.*]] = add nsw i32 [[TMP23]], [[TMP22]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_2]], ptr [[ARRAYIDX13_3_2]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_2:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.2: + ; CHECK-UNROLL-NEXT: [[CMP1_3:%.*]] = icmp eq i32 
3, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_3]], label [[CLEANUP]], label [[IF_END_3:%.*]] + ; CHECK-UNROLL: if.end.3: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 3 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_3:%.*]] + ; CHECK-UNROLL: for.cond4.3: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_36:%.*]] + ; CHECK-UNROLL: for.body7.36: + ; CHECK-UNROLL-NEXT: [[TMP24:%.*]] = load ptr, ptr [[ARRAYIDX_3]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP26:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_35:%.*]] = add nsw i32 [[TMP26]], [[TMP25]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_35]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_3:%.*]] + ; CHECK-UNROLL: for.body7.1.3: + ; CHECK-UNROLL-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX_3]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_3:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX11_1_3]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_3:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX13_1_3]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_3:%.*]] = add nsw i32 [[TMP29]], [[TMP28]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_3]], ptr [[ARRAYIDX13_1_3]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_3:%.*]] + ; CHECK-UNROLL: for.body7.2.3: + ; CHECK-UNROLL-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARRAYIDX_3]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_3:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX11_2_3]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_3:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP32:%.*]] = load i32, ptr 
[[ARRAYIDX13_2_3]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_3:%.*]] = add nsw i32 [[TMP32]], [[TMP31]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_3]], ptr [[ARRAYIDX13_2_3]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_3:%.*]] + ; CHECK-UNROLL: for.body7.3.3: + ; CHECK-UNROLL-NEXT: [[TMP33:%.*]] = load ptr, ptr [[ARRAYIDX_3]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_3:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX11_3_3]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_3:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX13_3_3]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_3:%.*]] = add nsw i32 [[TMP35]], [[TMP34]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_3]], ptr [[ARRAYIDX13_3_3]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_3:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.3: + ; CHECK-UNROLL-NEXT: [[CMP1_4:%.*]] = icmp eq i32 4, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_4]], label [[CLEANUP]], label [[IF_END_4:%.*]] + ; CHECK-UNROLL: if.end.4: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 4 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_4:%.*]] + ; CHECK-UNROLL: for.cond4.4: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_48:%.*]] + ; CHECK-UNROLL: for.body7.48: + ; CHECK-UNROLL-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARRAYIDX_4]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP38:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_47:%.*]] = add nsw i32 [[TMP38]], [[TMP37]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_47]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_4:%.*]] + ; CHECK-UNROLL: 
for.body7.1.4: + ; CHECK-UNROLL-NEXT: [[TMP39:%.*]] = load ptr, ptr [[ARRAYIDX_4]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_4:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP40:%.*]] = load i32, ptr [[ARRAYIDX11_1_4]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_4:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX13_1_4]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_4:%.*]] = add nsw i32 [[TMP41]], [[TMP40]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_4]], ptr [[ARRAYIDX13_1_4]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_4:%.*]] + ; CHECK-UNROLL: for.body7.2.4: + ; CHECK-UNROLL-NEXT: [[TMP42:%.*]] = load ptr, ptr [[ARRAYIDX_4]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_4:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX11_2_4]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_4:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP44:%.*]] = load i32, ptr [[ARRAYIDX13_2_4]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_4:%.*]] = add nsw i32 [[TMP44]], [[TMP43]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_4]], ptr [[ARRAYIDX13_2_4]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_4:%.*]] + ; CHECK-UNROLL: for.body7.3.4: + ; CHECK-UNROLL-NEXT: [[TMP45:%.*]] = load ptr, ptr [[ARRAYIDX_4]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_4:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP46:%.*]] = load i32, ptr [[ARRAYIDX11_3_4]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_4:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP47:%.*]] = load i32, ptr [[ARRAYIDX13_3_4]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_4:%.*]] = add nsw i32 [[TMP47]], [[TMP46]] + ; CHECK-UNROLL-NEXT: store i32 
[[ADD14_3_4]], ptr [[ARRAYIDX13_3_4]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_4:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.4: + ; CHECK-UNROLL-NEXT: [[CMP1_5:%.*]] = icmp eq i32 5, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_5]], label [[CLEANUP]], label [[IF_END_5:%.*]] + ; CHECK-UNROLL: if.end.5: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 5 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_5:%.*]] + ; CHECK-UNROLL: for.cond4.5: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_5:%.*]] + ; CHECK-UNROLL: for.body7.5: + ; CHECK-UNROLL-NEXT: [[TMP48:%.*]] = load ptr, ptr [[ARRAYIDX_5]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP48]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP50:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_5:%.*]] = add nsw i32 [[TMP50]], [[TMP49]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_5]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_5:%.*]] + ; CHECK-UNROLL: for.body7.1.5: + ; CHECK-UNROLL-NEXT: [[TMP51:%.*]] = load ptr, ptr [[ARRAYIDX_5]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_5:%.*]] = getelementptr inbounds i32, ptr [[TMP51]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP52:%.*]] = load i32, ptr [[ARRAYIDX11_1_5]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_5:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP53:%.*]] = load i32, ptr [[ARRAYIDX13_1_5]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_5:%.*]] = add nsw i32 [[TMP53]], [[TMP52]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_5]], ptr [[ARRAYIDX13_1_5]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_5:%.*]] + ; CHECK-UNROLL: for.body7.2.5: + ; CHECK-UNROLL-NEXT: [[TMP54:%.*]] = load ptr, ptr [[ARRAYIDX_5]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_5:%.*]] = 
getelementptr inbounds i32, ptr [[TMP54]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP55:%.*]] = load i32, ptr [[ARRAYIDX11_2_5]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_5:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP56:%.*]] = load i32, ptr [[ARRAYIDX13_2_5]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_5:%.*]] = add nsw i32 [[TMP56]], [[TMP55]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_5]], ptr [[ARRAYIDX13_2_5]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_5:%.*]] + ; CHECK-UNROLL: for.body7.3.5: + ; CHECK-UNROLL-NEXT: [[TMP57:%.*]] = load ptr, ptr [[ARRAYIDX_5]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_5:%.*]] = getelementptr inbounds i32, ptr [[TMP57]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP58:%.*]] = load i32, ptr [[ARRAYIDX11_3_5]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_5:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP59:%.*]] = load i32, ptr [[ARRAYIDX13_3_5]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_5:%.*]] = add nsw i32 [[TMP59]], [[TMP58]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_5]], ptr [[ARRAYIDX13_3_5]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_5:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.5: + ; CHECK-UNROLL-NEXT: [[CMP1_6:%.*]] = icmp eq i32 6, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_6]], label [[CLEANUP]], label [[IF_END_6:%.*]] + ; CHECK-UNROLL: if.end.6: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 6 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_6:%.*]] + ; CHECK-UNROLL: for.cond4.6: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_6:%.*]] + ; CHECK-UNROLL: for.body7.6: + ; CHECK-UNROLL-NEXT: [[TMP60:%.*]] = load ptr, ptr [[ARRAYIDX_6]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP61:%.*]] = load i32, ptr [[TMP60]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP62:%.*]] = load i32, ptr [[OUT]], 
align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_6:%.*]] = add nsw i32 [[TMP62]], [[TMP61]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_6]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_6:%.*]] + ; CHECK-UNROLL: for.body7.1.6: + ; CHECK-UNROLL-NEXT: [[TMP63:%.*]] = load ptr, ptr [[ARRAYIDX_6]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_6:%.*]] = getelementptr inbounds i32, ptr [[TMP63]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP64:%.*]] = load i32, ptr [[ARRAYIDX11_1_6]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_6:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP65:%.*]] = load i32, ptr [[ARRAYIDX13_1_6]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_6:%.*]] = add nsw i32 [[TMP65]], [[TMP64]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_6]], ptr [[ARRAYIDX13_1_6]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_6:%.*]] + ; CHECK-UNROLL: for.body7.2.6: + ; CHECK-UNROLL-NEXT: [[TMP66:%.*]] = load ptr, ptr [[ARRAYIDX_6]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_6:%.*]] = getelementptr inbounds i32, ptr [[TMP66]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP67:%.*]] = load i32, ptr [[ARRAYIDX11_2_6]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_6:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP68:%.*]] = load i32, ptr [[ARRAYIDX13_2_6]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_6:%.*]] = add nsw i32 [[TMP68]], [[TMP67]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_6]], ptr [[ARRAYIDX13_2_6]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_6:%.*]] + ; CHECK-UNROLL: for.body7.3.6: + ; CHECK-UNROLL-NEXT: [[TMP69:%.*]] = load ptr, ptr [[ARRAYIDX_6]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_6:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP70:%.*]] = load i32, ptr [[ARRAYIDX11_3_6]], align 4 + ; CHECK-UNROLL-NEXT: 
[[ARRAYIDX13_3_6:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP71:%.*]] = load i32, ptr [[ARRAYIDX13_3_6]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_6:%.*]] = add nsw i32 [[TMP71]], [[TMP70]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_6]], ptr [[ARRAYIDX13_3_6]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_6:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.6: + ; CHECK-UNROLL-NEXT: [[CMP1_7:%.*]] = icmp eq i32 7, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_7]], label [[CLEANUP]], label [[IF_END_7:%.*]] + ; CHECK-UNROLL: if.end.7: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 7 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_7:%.*]] + ; CHECK-UNROLL: for.cond4.7: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_7:%.*]] + ; CHECK-UNROLL: for.body7.7: + ; CHECK-UNROLL-NEXT: [[TMP72:%.*]] = load ptr, ptr [[ARRAYIDX_7]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP72]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP74:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_7:%.*]] = add nsw i32 [[TMP74]], [[TMP73]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_7]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_7:%.*]] + ; CHECK-UNROLL: for.body7.1.7: + ; CHECK-UNROLL-NEXT: [[TMP75:%.*]] = load ptr, ptr [[ARRAYIDX_7]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_7:%.*]] = getelementptr inbounds i32, ptr [[TMP75]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP76:%.*]] = load i32, ptr [[ARRAYIDX11_1_7]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_7:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP77:%.*]] = load i32, ptr [[ARRAYIDX13_1_7]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_7:%.*]] = add nsw i32 [[TMP77]], [[TMP76]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_7]], ptr [[ARRAYIDX13_1_7]], align 4 + ; 
CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_7:%.*]] + ; CHECK-UNROLL: for.body7.2.7: + ; CHECK-UNROLL-NEXT: [[TMP78:%.*]] = load ptr, ptr [[ARRAYIDX_7]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_7:%.*]] = getelementptr inbounds i32, ptr [[TMP78]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP79:%.*]] = load i32, ptr [[ARRAYIDX11_2_7]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_7:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP80:%.*]] = load i32, ptr [[ARRAYIDX13_2_7]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_7:%.*]] = add nsw i32 [[TMP80]], [[TMP79]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_7]], ptr [[ARRAYIDX13_2_7]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_7:%.*]] + ; CHECK-UNROLL: for.body7.3.7: + ; CHECK-UNROLL-NEXT: [[TMP81:%.*]] = load ptr, ptr [[ARRAYIDX_7]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_7:%.*]] = getelementptr inbounds i32, ptr [[TMP81]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP82:%.*]] = load i32, ptr [[ARRAYIDX11_3_7]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_7:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP83:%.*]] = load i32, ptr [[ARRAYIDX13_3_7]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_7:%.*]] = add nsw i32 [[TMP83]], [[TMP82]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_7]], ptr [[ARRAYIDX13_3_7]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_7:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.7: + ; CHECK-UNROLL-NEXT: [[CMP1_8:%.*]] = icmp eq i32 8, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_8]], label [[CLEANUP]], label [[IF_END_8:%.*]] + ; CHECK-UNROLL: if.end.8: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 8 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_8:%.*]] + ; CHECK-UNROLL: for.cond4.8: + ; CHECK-UNROLL-NEXT: br label 
[[FOR_BODY7_8:%.*]] + ; CHECK-UNROLL: for.body7.8: + ; CHECK-UNROLL-NEXT: [[TMP84:%.*]] = load ptr, ptr [[ARRAYIDX_8]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP85:%.*]] = load i32, ptr [[TMP84]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP86:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_8:%.*]] = add nsw i32 [[TMP86]], [[TMP85]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_8]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_8:%.*]] + ; CHECK-UNROLL: for.body7.1.8: + ; CHECK-UNROLL-NEXT: [[TMP87:%.*]] = load ptr, ptr [[ARRAYIDX_8]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_8:%.*]] = getelementptr inbounds i32, ptr [[TMP87]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP88:%.*]] = load i32, ptr [[ARRAYIDX11_1_8]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_8:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP89:%.*]] = load i32, ptr [[ARRAYIDX13_1_8]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_8:%.*]] = add nsw i32 [[TMP89]], [[TMP88]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_8]], ptr [[ARRAYIDX13_1_8]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_8:%.*]] + ; CHECK-UNROLL: for.body7.2.8: + ; CHECK-UNROLL-NEXT: [[TMP90:%.*]] = load ptr, ptr [[ARRAYIDX_8]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_8:%.*]] = getelementptr inbounds i32, ptr [[TMP90]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP91:%.*]] = load i32, ptr [[ARRAYIDX11_2_8]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_8:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP92:%.*]] = load i32, ptr [[ARRAYIDX13_2_8]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_8:%.*]] = add nsw i32 [[TMP92]], [[TMP91]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_8]], ptr [[ARRAYIDX13_2_8]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_8:%.*]] + ; CHECK-UNROLL: for.body7.3.8: + ; 
CHECK-UNROLL-NEXT: [[TMP93:%.*]] = load ptr, ptr [[ARRAYIDX_8]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_8:%.*]] = getelementptr inbounds i32, ptr [[TMP93]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP94:%.*]] = load i32, ptr [[ARRAYIDX11_3_8]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_8:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP95:%.*]] = load i32, ptr [[ARRAYIDX13_3_8]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_8:%.*]] = add nsw i32 [[TMP95]], [[TMP94]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_8]], ptr [[ARRAYIDX13_3_8]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_8:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.8: + ; CHECK-UNROLL-NEXT: [[CMP1_9:%.*]] = icmp eq i32 9, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_9]], label [[CLEANUP]], label [[IF_END_9:%.*]] + ; CHECK-UNROLL: if.end.9: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 9 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_9:%.*]] + ; CHECK-UNROLL: for.cond4.9: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_9:%.*]] + ; CHECK-UNROLL: for.body7.9: + ; CHECK-UNROLL-NEXT: [[TMP96:%.*]] = load ptr, ptr [[ARRAYIDX_9]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP96]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP98:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_9:%.*]] = add nsw i32 [[TMP98]], [[TMP97]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_9]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_9:%.*]] + ; CHECK-UNROLL: for.body7.1.9: + ; CHECK-UNROLL-NEXT: [[TMP99:%.*]] = load ptr, ptr [[ARRAYIDX_9]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_9:%.*]] = getelementptr inbounds i32, ptr [[TMP99]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP100:%.*]] = load i32, ptr [[ARRAYIDX11_1_9]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_9:%.*]] = getelementptr 
inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP101:%.*]] = load i32, ptr [[ARRAYIDX13_1_9]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_9:%.*]] = add nsw i32 [[TMP101]], [[TMP100]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_9]], ptr [[ARRAYIDX13_1_9]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_9:%.*]] + ; CHECK-UNROLL: for.body7.2.9: + ; CHECK-UNROLL-NEXT: [[TMP102:%.*]] = load ptr, ptr [[ARRAYIDX_9]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_9:%.*]] = getelementptr inbounds i32, ptr [[TMP102]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP103:%.*]] = load i32, ptr [[ARRAYIDX11_2_9]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_9:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP104:%.*]] = load i32, ptr [[ARRAYIDX13_2_9]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_9:%.*]] = add nsw i32 [[TMP104]], [[TMP103]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_9]], ptr [[ARRAYIDX13_2_9]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_9:%.*]] + ; CHECK-UNROLL: for.body7.3.9: + ; CHECK-UNROLL-NEXT: [[TMP105:%.*]] = load ptr, ptr [[ARRAYIDX_9]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_9:%.*]] = getelementptr inbounds i32, ptr [[TMP105]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP106:%.*]] = load i32, ptr [[ARRAYIDX11_3_9]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_9:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP107:%.*]] = load i32, ptr [[ARRAYIDX13_3_9]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_9:%.*]] = add nsw i32 [[TMP107]], [[TMP106]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_9]], ptr [[ARRAYIDX13_3_9]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_9:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.9: + ; CHECK-UNROLL-NEXT: [[CMP1_10:%.*]] = icmp eq i32 10, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 
[[CMP1_10]], label [[CLEANUP]], label [[IF_END_10:%.*]] + ; CHECK-UNROLL: if.end.10: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 10 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_10:%.*]] + ; CHECK-UNROLL: for.cond4.10: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_10:%.*]] + ; CHECK-UNROLL: for.body7.10: + ; CHECK-UNROLL-NEXT: [[TMP108:%.*]] = load ptr, ptr [[ARRAYIDX_10]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP108]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP110:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_10:%.*]] = add nsw i32 [[TMP110]], [[TMP109]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_10]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_10:%.*]] + ; CHECK-UNROLL: for.body7.1.10: + ; CHECK-UNROLL-NEXT: [[TMP111:%.*]] = load ptr, ptr [[ARRAYIDX_10]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_10:%.*]] = getelementptr inbounds i32, ptr [[TMP111]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP112:%.*]] = load i32, ptr [[ARRAYIDX11_1_10]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_10:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP113:%.*]] = load i32, ptr [[ARRAYIDX13_1_10]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_10:%.*]] = add nsw i32 [[TMP113]], [[TMP112]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_10]], ptr [[ARRAYIDX13_1_10]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_10:%.*]] + ; CHECK-UNROLL: for.body7.2.10: + ; CHECK-UNROLL-NEXT: [[TMP114:%.*]] = load ptr, ptr [[ARRAYIDX_10]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_10:%.*]] = getelementptr inbounds i32, ptr [[TMP114]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP115:%.*]] = load i32, ptr [[ARRAYIDX11_2_10]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_10:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP116:%.*]] = load i32, ptr 
[[ARRAYIDX13_2_10]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_10:%.*]] = add nsw i32 [[TMP116]], [[TMP115]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_10]], ptr [[ARRAYIDX13_2_10]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_10:%.*]] + ; CHECK-UNROLL: for.body7.3.10: + ; CHECK-UNROLL-NEXT: [[TMP117:%.*]] = load ptr, ptr [[ARRAYIDX_10]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_10:%.*]] = getelementptr inbounds i32, ptr [[TMP117]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP118:%.*]] = load i32, ptr [[ARRAYIDX11_3_10]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_10:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP119:%.*]] = load i32, ptr [[ARRAYIDX13_3_10]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_10:%.*]] = add nsw i32 [[TMP119]], [[TMP118]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_10]], ptr [[ARRAYIDX13_3_10]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_10:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.10: + ; CHECK-UNROLL-NEXT: [[CMP1_11:%.*]] = icmp eq i32 11, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_11]], label [[CLEANUP]], label [[IF_END_11:%.*]] + ; CHECK-UNROLL: if.end.11: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 11 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_11:%.*]] + ; CHECK-UNROLL: for.cond4.11: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_11:%.*]] + ; CHECK-UNROLL: for.body7.11: + ; CHECK-UNROLL-NEXT: [[TMP120:%.*]] = load ptr, ptr [[ARRAYIDX_11]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP122:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_119:%.*]] = add nsw i32 [[TMP122]], [[TMP121]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_119]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label 
[[FOR_BODY7_1_11:%.*]] + ; CHECK-UNROLL: for.body7.1.11: + ; CHECK-UNROLL-NEXT: [[TMP123:%.*]] = load ptr, ptr [[ARRAYIDX_11]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_11:%.*]] = getelementptr inbounds i32, ptr [[TMP123]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP124:%.*]] = load i32, ptr [[ARRAYIDX11_1_11]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_11:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP125:%.*]] = load i32, ptr [[ARRAYIDX13_1_11]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_11:%.*]] = add nsw i32 [[TMP125]], [[TMP124]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_11]], ptr [[ARRAYIDX13_1_11]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_11:%.*]] + ; CHECK-UNROLL: for.body7.2.11: + ; CHECK-UNROLL-NEXT: [[TMP126:%.*]] = load ptr, ptr [[ARRAYIDX_11]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_11:%.*]] = getelementptr inbounds i32, ptr [[TMP126]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP127:%.*]] = load i32, ptr [[ARRAYIDX11_2_11]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_11:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP128:%.*]] = load i32, ptr [[ARRAYIDX13_2_11]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_11:%.*]] = add nsw i32 [[TMP128]], [[TMP127]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_11]], ptr [[ARRAYIDX13_2_11]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_11:%.*]] + ; CHECK-UNROLL: for.body7.3.11: + ; CHECK-UNROLL-NEXT: [[TMP129:%.*]] = load ptr, ptr [[ARRAYIDX_11]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_11:%.*]] = getelementptr inbounds i32, ptr [[TMP129]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP130:%.*]] = load i32, ptr [[ARRAYIDX11_3_11]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_11:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP131:%.*]] = load i32, ptr [[ARRAYIDX13_3_11]], align 4 + ; CHECK-UNROLL-NEXT: 
[[ADD14_3_11:%.*]] = add nsw i32 [[TMP131]], [[TMP130]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_11]], ptr [[ARRAYIDX13_3_11]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_11:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.11: + ; CHECK-UNROLL-NEXT: [[CMP1_12:%.*]] = icmp eq i32 12, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_12]], label [[CLEANUP]], label [[IF_END_12:%.*]] + ; CHECK-UNROLL: if.end.12: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 12 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_12:%.*]] + ; CHECK-UNROLL: for.cond4.12: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1210:%.*]] + ; CHECK-UNROLL: for.body7.1210: + ; CHECK-UNROLL-NEXT: [[TMP132:%.*]] = load ptr, ptr [[ARRAYIDX_12]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP133:%.*]] = load i32, ptr [[TMP132]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP134:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_12:%.*]] = add nsw i32 [[TMP134]], [[TMP133]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_12]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_12:%.*]] + ; CHECK-UNROLL: for.body7.1.12: + ; CHECK-UNROLL-NEXT: [[TMP135:%.*]] = load ptr, ptr [[ARRAYIDX_12]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_12:%.*]] = getelementptr inbounds i32, ptr [[TMP135]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP136:%.*]] = load i32, ptr [[ARRAYIDX11_1_12]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_12:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP137:%.*]] = load i32, ptr [[ARRAYIDX13_1_12]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_12:%.*]] = add nsw i32 [[TMP137]], [[TMP136]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_12]], ptr [[ARRAYIDX13_1_12]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_12:%.*]] + ; CHECK-UNROLL: for.body7.2.12: 
+ ; CHECK-UNROLL-NEXT: [[TMP138:%.*]] = load ptr, ptr [[ARRAYIDX_12]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_12:%.*]] = getelementptr inbounds i32, ptr [[TMP138]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP139:%.*]] = load i32, ptr [[ARRAYIDX11_2_12]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_12:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP140:%.*]] = load i32, ptr [[ARRAYIDX13_2_12]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_12:%.*]] = add nsw i32 [[TMP140]], [[TMP139]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_12]], ptr [[ARRAYIDX13_2_12]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_12:%.*]] + ; CHECK-UNROLL: for.body7.3.12: + ; CHECK-UNROLL-NEXT: [[TMP141:%.*]] = load ptr, ptr [[ARRAYIDX_12]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_12:%.*]] = getelementptr inbounds i32, ptr [[TMP141]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP142:%.*]] = load i32, ptr [[ARRAYIDX11_3_12]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_12:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP143:%.*]] = load i32, ptr [[ARRAYIDX13_3_12]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_12:%.*]] = add nsw i32 [[TMP143]], [[TMP142]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_12]], ptr [[ARRAYIDX13_3_12]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_12:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.12: + ; CHECK-UNROLL-NEXT: [[CMP1_13:%.*]] = icmp eq i32 13, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_13]], label [[CLEANUP]], label [[IF_END_13:%.*]] + ; CHECK-UNROLL: if.end.13: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 13 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_13:%.*]] + ; CHECK-UNROLL: for.cond4.13: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_13:%.*]] + ; CHECK-UNROLL: for.body7.13: + ; CHECK-UNROLL-NEXT: [[TMP144:%.*]] = 
load ptr, ptr [[ARRAYIDX_13]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP145:%.*]] = load i32, ptr [[TMP144]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP146:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_13:%.*]] = add nsw i32 [[TMP146]], [[TMP145]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_13]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_13:%.*]] + ; CHECK-UNROLL: for.body7.1.13: + ; CHECK-UNROLL-NEXT: [[TMP147:%.*]] = load ptr, ptr [[ARRAYIDX_13]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_13:%.*]] = getelementptr inbounds i32, ptr [[TMP147]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP148:%.*]] = load i32, ptr [[ARRAYIDX11_1_13]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_13:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP149:%.*]] = load i32, ptr [[ARRAYIDX13_1_13]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_13:%.*]] = add nsw i32 [[TMP149]], [[TMP148]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_13]], ptr [[ARRAYIDX13_1_13]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_13:%.*]] + ; CHECK-UNROLL: for.body7.2.13: + ; CHECK-UNROLL-NEXT: [[TMP150:%.*]] = load ptr, ptr [[ARRAYIDX_13]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_13:%.*]] = getelementptr inbounds i32, ptr [[TMP150]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP151:%.*]] = load i32, ptr [[ARRAYIDX11_2_13]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_13:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP152:%.*]] = load i32, ptr [[ARRAYIDX13_2_13]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_13:%.*]] = add nsw i32 [[TMP152]], [[TMP151]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_13]], ptr [[ARRAYIDX13_2_13]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_13:%.*]] + ; CHECK-UNROLL: for.body7.3.13: + ; CHECK-UNROLL-NEXT: [[TMP153:%.*]] = load ptr, ptr 
[[ARRAYIDX_13]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_13:%.*]] = getelementptr inbounds i32, ptr [[TMP153]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP154:%.*]] = load i32, ptr [[ARRAYIDX11_3_13]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_13:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP155:%.*]] = load i32, ptr [[ARRAYIDX13_3_13]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_13:%.*]] = add nsw i32 [[TMP155]], [[TMP154]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_13]], ptr [[ARRAYIDX13_3_13]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_13:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.13: + ; CHECK-UNROLL-NEXT: [[CMP1_14:%.*]] = icmp eq i32 14, [[DIMS]] + ; CHECK-UNROLL-NEXT: br i1 [[CMP1_14]], label [[CLEANUP]], label [[IF_END_14:%.*]] + ; CHECK-UNROLL: if.end.14: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 14 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_14:%.*]] + ; CHECK-UNROLL: for.cond4.14: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_14:%.*]] + ; CHECK-UNROLL: for.body7.14: + ; CHECK-UNROLL-NEXT: [[TMP156:%.*]] = load ptr, ptr [[ARRAYIDX_14]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP157:%.*]] = load i32, ptr [[TMP156]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP158:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_14:%.*]] = add nsw i32 [[TMP158]], [[TMP157]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_14]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_14:%.*]] + ; CHECK-UNROLL: for.body7.1.14: + ; CHECK-UNROLL-NEXT: [[TMP159:%.*]] = load ptr, ptr [[ARRAYIDX_14]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_14:%.*]] = getelementptr inbounds i32, ptr [[TMP159]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP160:%.*]] = load i32, ptr [[ARRAYIDX11_1_14]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_14:%.*]] = getelementptr 
inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP161:%.*]] = load i32, ptr [[ARRAYIDX13_1_14]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_14:%.*]] = add nsw i32 [[TMP161]], [[TMP160]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_14]], ptr [[ARRAYIDX13_1_14]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_14:%.*]] + ; CHECK-UNROLL: for.body7.2.14: + ; CHECK-UNROLL-NEXT: [[TMP162:%.*]] = load ptr, ptr [[ARRAYIDX_14]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_14:%.*]] = getelementptr inbounds i32, ptr [[TMP162]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP163:%.*]] = load i32, ptr [[ARRAYIDX11_2_14]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_14:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP164:%.*]] = load i32, ptr [[ARRAYIDX13_2_14]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_14:%.*]] = add nsw i32 [[TMP164]], [[TMP163]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_14]], ptr [[ARRAYIDX13_2_14]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_14:%.*]] + ; CHECK-UNROLL: for.body7.3.14: + ; CHECK-UNROLL-NEXT: [[TMP165:%.*]] = load ptr, ptr [[ARRAYIDX_14]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_14:%.*]] = getelementptr inbounds i32, ptr [[TMP165]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP166:%.*]] = load i32, ptr [[ARRAYIDX11_3_14]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_14:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP167:%.*]] = load i32, ptr [[ARRAYIDX13_3_14]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_14:%.*]] = add nsw i32 [[TMP167]], [[TMP166]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_14]], ptr [[ARRAYIDX13_3_14]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_14:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.14: + ; CHECK-UNROLL-NEXT: [[CMP1_15:%.*]] = icmp eq i32 15, [[DIMS]] + ; 
CHECK-UNROLL-NEXT: br i1 [[CMP1_15]], label [[CLEANUP]], label [[IF_END_15:%.*]] + ; CHECK-UNROLL: if.end.15: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 15 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_15:%.*]] + ; CHECK-UNROLL: for.cond4.15: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_15:%.*]] + ; CHECK-UNROLL: for.body7.15: + ; CHECK-UNROLL-NEXT: [[TMP168:%.*]] = load ptr, ptr [[ARRAYIDX_15]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP169:%.*]] = load i32, ptr [[TMP168]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP170:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_15:%.*]] = add nsw i32 [[TMP170]], [[TMP169]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_15]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_15:%.*]] + ; CHECK-UNROLL: for.body7.1.15: + ; CHECK-UNROLL-NEXT: [[TMP171:%.*]] = load ptr, ptr [[ARRAYIDX_15]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_15:%.*]] = getelementptr inbounds i32, ptr [[TMP171]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP172:%.*]] = load i32, ptr [[ARRAYIDX11_1_15]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_15:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP173:%.*]] = load i32, ptr [[ARRAYIDX13_1_15]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_15:%.*]] = add nsw i32 [[TMP173]], [[TMP172]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_15]], ptr [[ARRAYIDX13_1_15]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_15:%.*]] + ; CHECK-UNROLL: for.body7.2.15: + ; CHECK-UNROLL-NEXT: [[TMP174:%.*]] = load ptr, ptr [[ARRAYIDX_15]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_15:%.*]] = getelementptr inbounds i32, ptr [[TMP174]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP175:%.*]] = load i32, ptr [[ARRAYIDX11_2_15]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_15:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: 
[[TMP176:%.*]] = load i32, ptr [[ARRAYIDX13_2_15]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_15:%.*]] = add nsw i32 [[TMP176]], [[TMP175]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_15]], ptr [[ARRAYIDX13_2_15]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_15:%.*]] + ; CHECK-UNROLL: for.body7.3.15: + ; CHECK-UNROLL-NEXT: [[TMP177:%.*]] = load ptr, ptr [[ARRAYIDX_15]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_15:%.*]] = getelementptr inbounds i32, ptr [[TMP177]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP178:%.*]] = load i32, ptr [[ARRAYIDX11_3_15]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_15:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP179:%.*]] = load i32, ptr [[ARRAYIDX13_3_15]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_15:%.*]] = add nsw i32 [[TMP179]], [[TMP178]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_15]], ptr [[ARRAYIDX13_3_15]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_15:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.15: + ; CHECK-UNROLL-NEXT: br i1 true, label [[CLEANUP]], label [[IF_END_16:%.*]] + ; CHECK-UNROLL: if.end.16: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 16 + ; CHECK-UNROLL-NEXT: br label [[FOR_COND4_16:%.*]] + ; CHECK-UNROLL: for.cond4.16: + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_16:%.*]] + ; CHECK-UNROLL: for.body7.16: + ; CHECK-UNROLL-NEXT: [[TMP180:%.*]] = load ptr, ptr [[ARRAYIDX_16]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP181:%.*]] = load i32, ptr [[TMP180]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP182:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_16:%.*]] = add nsw i32 [[TMP182]], [[TMP181]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_16]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_16:%.*]] + ; CHECK-UNROLL: 
for.body7.1.16: + ; CHECK-UNROLL-NEXT: [[TMP183:%.*]] = load ptr, ptr [[ARRAYIDX_16]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_16:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP184:%.*]] = load i32, ptr [[ARRAYIDX11_1_16]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_16:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP185:%.*]] = load i32, ptr [[ARRAYIDX13_1_16]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1_16:%.*]] = add nsw i32 [[TMP185]], [[TMP184]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_16]], ptr [[ARRAYIDX13_1_16]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_16:%.*]] + ; CHECK-UNROLL: for.body7.2.16: + ; CHECK-UNROLL-NEXT: [[TMP186:%.*]] = load ptr, ptr [[ARRAYIDX_16]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_16:%.*]] = getelementptr inbounds i32, ptr [[TMP186]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP187:%.*]] = load i32, ptr [[ARRAYIDX11_2_16]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_16:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP188:%.*]] = load i32, ptr [[ARRAYIDX13_2_16]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2_16:%.*]] = add nsw i32 [[TMP188]], [[TMP187]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_16]], ptr [[ARRAYIDX13_2_16]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_16:%.*]] + ; CHECK-UNROLL: for.body7.3.16: + ; CHECK-UNROLL-NEXT: [[TMP189:%.*]] = load ptr, ptr [[ARRAYIDX_16]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_16:%.*]] = getelementptr inbounds i32, ptr [[TMP189]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP190:%.*]] = load i32, ptr [[ARRAYIDX11_3_16]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_16:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP191:%.*]] = load i32, ptr [[ARRAYIDX13_3_16]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3_16:%.*]] = add nsw i32 [[TMP191]], 
[[TMP190]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_16]], ptr [[ARRAYIDX13_3_16]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_16:%.*]] + ; CHECK-UNROLL: for.cond.cleanup6.16: + ; CHECK-UNROLL-NEXT: unreachable + ; CHECK-UNROLL: for.body7: + ; CHECK-UNROLL-NEXT: [[TMP192:%.*]] = load ptr, ptr [[ARR]], align 8 + ; CHECK-UNROLL-NEXT: [[TMP193:%.*]] = load i32, ptr [[TMP192]], align 4 + ; CHECK-UNROLL-NEXT: [[TMP194:%.*]] = load i32, ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP194]], [[TMP193]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14]], ptr [[OUT]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1:%.*]] + ; CHECK-UNROLL: for.body7.1: + ; CHECK-UNROLL-NEXT: [[TMP195:%.*]] = load ptr, ptr [[ARR]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1:%.*]] = getelementptr inbounds i32, ptr [[TMP195]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP196:%.*]] = load i32, ptr [[ARRAYIDX11_1]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 + ; CHECK-UNROLL-NEXT: [[TMP197:%.*]] = load i32, ptr [[ARRAYIDX13_1]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_1:%.*]] = add nsw i32 [[TMP197]], [[TMP196]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_1]], ptr [[ARRAYIDX13_1]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2:%.*]] + ; CHECK-UNROLL: for.body7.2: + ; CHECK-UNROLL-NEXT: [[TMP198:%.*]] = load ptr, ptr [[ARR]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2:%.*]] = getelementptr inbounds i32, ptr [[TMP198]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP199:%.*]] = load i32, ptr [[ARRAYIDX11_2]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 + ; CHECK-UNROLL-NEXT: [[TMP200:%.*]] = load i32, ptr [[ARRAYIDX13_2]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_2:%.*]] = add nsw i32 
[[TMP200]], [[TMP199]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_2]], ptr [[ARRAYIDX13_2]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3:%.*]] + ; CHECK-UNROLL: for.body7.3: + ; CHECK-UNROLL-NEXT: [[TMP201:%.*]] = load ptr, ptr [[ARR]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3:%.*]] = getelementptr inbounds i32, ptr [[TMP201]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP202:%.*]] = load i32, ptr [[ARRAYIDX11_3]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 + ; CHECK-UNROLL-NEXT: [[TMP203:%.*]] = load i32, ptr [[ARRAYIDX13_3]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_3:%.*]] = add nsw i32 [[TMP203]], [[TMP202]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_3]], ptr [[ARRAYIDX13_3]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6:%.*]] + ; CHECK-UNROLL: for.body7.4: + ; CHECK-UNROLL-NEXT: [[ARRAYIDX_LCSSA:%.*]] = phi ptr [ [[ARR]], [[FOR_BODY7_3]] ], [ [[ARRAYIDX_1]], [[FOR_BODY7_3_1]] ], [ [[ARRAYIDX_2]], [[FOR_BODY7_3_2]] ], [ [[ARRAYIDX_3]], [[FOR_BODY7_3_3]] ], [ [[ARRAYIDX_4]], [[FOR_BODY7_3_4]] ], [ [[ARRAYIDX_5]], [[FOR_BODY7_3_5]] ], [ [[ARRAYIDX_6]], [[FOR_BODY7_3_6]] ], [ [[ARRAYIDX_7]], [[FOR_BODY7_3_7]] ], [ [[ARRAYIDX_8]], [[FOR_BODY7_3_8]] ], [ [[ARRAYIDX_9]], [[FOR_BODY7_3_9]] ], [ [[ARRAYIDX_10]], [[FOR_BODY7_3_10]] ], [ [[ARRAYIDX_11]], [[FOR_BODY7_3_11]] ], [ [[ARRAYIDX_12]], [[FOR_BODY7_3_12]] ], [ [[ARRAYIDX_13]], [[FOR_BODY7_3_13]] ], [ [[ARRAYIDX_14]], [[FOR_BODY7_3_14]] ], [ [[ARRAYIDX_15]], [[FOR_BODY7_3_15]] ], [ [[ARRAYIDX_16]], [[FOR_BODY7_3_16]] ] + ; CHECK-UNROLL-NEXT: [[TMP204:%.*]] = load ptr, ptr [[ARRAYIDX_LCSSA]], align 8 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX11_4:%.*]] = getelementptr inbounds i32, ptr [[TMP204]], i64 4 + ; CHECK-UNROLL-NEXT: [[TMP205:%.*]] = load i32, ptr [[ARRAYIDX11_4]], align 4 + ; CHECK-UNROLL-NEXT: [[ARRAYIDX13_4:%.*]] = 
getelementptr inbounds i32, ptr [[OUT]], i64 4 + ; CHECK-UNROLL-NEXT: [[TMP206:%.*]] = load i32, ptr [[ARRAYIDX13_4]], align 4 + ; CHECK-UNROLL-NEXT: [[ADD14_4:%.*]] = add nsw i32 [[TMP206]], [[TMP205]] + ; CHECK-UNROLL-NEXT: store i32 [[ADD14_4]], ptr [[ARRAYIDX13_4]], align 4 + ; CHECK-UNROLL-NEXT: call void @_Z3barv() + ; CHECK-UNROLL-NEXT: unreachable + ; CHECK-UNROLL: cleanup: + ; CHECK-UNROLL-NEXT: ret void + ; + entry: + br label %for.cond + + for.cond: ; preds = %for.cond.cleanup6, %entry + %Dim.0 = phi i32 [ 0, %entry ], [ %inc16, %for.cond.cleanup6 ] + %Idx.addr.0 = phi i32 [ %Idx, %entry ], [ %add, %for.cond.cleanup6 ] + %cmp = icmp slt i32 %Dim.0, 16 + br i1 %cmp, label %for.body, label %for.cond.cleanup + + for.cond.cleanup: ; preds = %for.cond + br label %cleanup + + for.body: ; preds = %for.cond + %cmp1 = icmp eq i32 %Dim.0, %Dims + br i1 %cmp1, label %if.then, label %if.end + + if.then: ; preds = %for.body + br label %cleanup + + if.end: ; preds = %for.body + %idxprom = sext i32 %Dim.0 to i64 + %arrayidx = getelementptr inbounds ptr, ptr %Arr, i64 %idxprom + %0 = load ptr, ptr %arrayidx, align 8 + %idxprom2 = sext i32 %Idx.addr.0 to i64 + %arrayidx3 = getelementptr inbounds i32, ptr %0, i64 %idxprom2 + %1 = load i32, ptr %arrayidx3, align 4 + %add = add nsw i32 %1, 1 + br label %for.cond4 + + for.cond4: ; preds = %for.body7, %if.end + %arg.0 = phi i32 [ 0, %if.end ], [ %inc, %for.body7 ] + %cmp5 = icmp slt i32 %arg.0, 4 + br i1 %cmp5, label %for.body7, label %for.cond.cleanup6 + + for.cond.cleanup6: ; preds = %for.cond4 + %inc16 = add nsw i32 %Dim.0, 1 + br label %for.cond, !llvm.loop !0 + + for.body7: ; preds = %for.cond4 + %2 = load ptr, ptr %arrayidx, align 8 + %idxprom10 = sext i32 %arg.0 to i64 + %arrayidx11 = getelementptr inbounds i32, ptr %2, i64 %idxprom10 + %3 = load i32, ptr %arrayidx11, align 4 + %arrayidx13 = getelementptr inbounds i32, ptr %Out, i64 %idxprom10 + %4 = load i32, ptr %arrayidx13, align 4 + %add14 = add nsw i32 %4, %3 + 
store i32 %add14, ptr %arrayidx13, align 4 + call void @_Z3barv() + %inc = add nsw i32 %arg.0, 1 + br label %for.cond4, !llvm.loop !3 + + cleanup: ; preds = %if.then, %for.cond.cleanup + ret void + } + + declare void @_Z3barv() + + !0 = distinct !{!0, !1, !2} + !1 = !{!"llvm.loop.mustprogress"} + !2 = !{!"llvm.loop.unroll.enable"} + !3 = distinct !{!3, !1} + ;. + ; CHECK-CFG: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} + ; CHECK-CFG: [[META1]] = !{!"llvm.loop.mustprogress"} + ; CHECK-CFG: [[META2]] = !{!"llvm.loop.unroll.enable"} + ; CHECK-CFG: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} + ;. + diff --git a/llvm/test/Transforms/LoopUnroll/unroll-cleanup.ll b/llvm/test/Transforms/LoopUnroll/unroll-cleanup.ll index 40d6ddc27370e..9f055007cdd90 100644 --- a/llvm/test/Transforms/LoopUnroll/unroll-cleanup.ll +++ b/llvm/test/Transforms/LoopUnroll/unroll-cleanup.ll @@ -23,38 +23,38 @@ define void @_Z3fn1v(ptr %r, ptr %a) #0 { ; CHECK-LABEL: define void @_Z3fn1v( ; CHECK-SAME: ptr nocapture writeonly [[R:%.*]], ptr nocapture readonly [[A:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr @b, align 4 -; CHECK-NEXT: [[TOBOOL20:%.*]] = icmp eq i32 [[TMP]], 0 +; CHECK-NEXT: [[T:%.*]] = load i32, ptr @b, align 4 +; CHECK-NEXT: [[TOBOOL20:%.*]] = icmp eq i32 [[T]], 0 ; CHECK-NEXT: br i1 [[TOBOOL20]], label %[[FOR_END6:.*]], label %[[FOR_BODY:.*]] ; CHECK: [[FOR_COND_LOOPEXIT_LOOPEXIT:.*]]: ; CHECK-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi ptr [ [[ADD_PTR_LCSSA_UNR:%.*]], %[[FOR_BODY3_PROL_LOOPEXIT:.*]] ], [ [[ADD_PTR_1:%.*]], %[[FOR_INC_1:.*]] ] ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A_021:%.*]], i64 1 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[TMP1:%.*]] -; CHECK-NEXT: [[TMP1_PRE:%.*]] = load i32, ptr @b, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[T2:%.*]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: 
[[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[T1_PRE:%.*]] = load i32, ptr @b, align 4 ; CHECK-NEXT: br label %[[FOR_COND_LOOPEXIT:.*]] ; CHECK: [[FOR_COND_LOOPEXIT]]: -; CHECK-NEXT: [[T1:%.*]] = phi i32 [ [[T12:%.*]], %[[FOR_BODY]] ], [ [[TMP1_PRE]], %[[FOR_COND_LOOPEXIT_LOOPEXIT]] ] +; CHECK-NEXT: [[T1:%.*]] = phi i32 [ [[T12:%.*]], %[[FOR_BODY]] ], [ [[T1_PRE]], %[[FOR_COND_LOOPEXIT_LOOPEXIT]] ] ; CHECK-NEXT: [[R_1_LCSSA:%.*]] = phi ptr [ [[R_022:%.*]], %[[FOR_BODY]] ], [ [[ADD_PTR_LCSSA]], %[[FOR_COND_LOOPEXIT_LOOPEXIT]] ] -; CHECK-NEXT: [[A_1_LCSSA:%.*]] = phi ptr [ [[A_021]], %[[FOR_BODY]] ], [ [[SCEVGEP1]], %[[FOR_COND_LOOPEXIT_LOOPEXIT]] ] +; CHECK-NEXT: [[A_1_LCSSA:%.*]] = phi ptr [ [[A_021:%.*]], %[[FOR_BODY]] ], [ [[SCEVGEP1]], %[[FOR_COND_LOOPEXIT_LOOPEXIT]] ] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[T1]], 0 ; CHECK-NEXT: br i1 [[TOBOOL]], label %[[FOR_END6]], label %[[FOR_BODY]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[T12]] = phi i32 [ [[T1]], %[[FOR_COND_LOOPEXIT]] ], [ [[TMP]], %[[ENTRY]] ] +; CHECK-NEXT: [[T12]] = phi i32 [ [[T1]], %[[FOR_COND_LOOPEXIT]] ], [ [[T]], %[[ENTRY]] ] ; CHECK-NEXT: [[R_022]] = phi ptr [ [[R_1_LCSSA]], %[[FOR_COND_LOOPEXIT]] ], [ [[R]], %[[ENTRY]] ] ; CHECK-NEXT: [[A_021]] = phi ptr [ [[A_1_LCSSA]], %[[FOR_COND_LOOPEXIT]] ], [ [[A]], %[[ENTRY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @c, align 4 -; CHECK-NEXT: [[TOBOOL215:%.*]] = icmp eq i32 [[TMP2]], 0 +; CHECK-NEXT: [[T2]] = load i32, ptr @c, align 4 +; CHECK-NEXT: [[TOBOOL215:%.*]] = icmp eq i32 [[T2]], 0 ; CHECK-NEXT: br i1 [[TOBOOL215]], label %[[FOR_COND_LOOPEXIT]], label %[[FOR_BODY3_PREHEADER:.*]] ; CHECK: [[FOR_BODY3_PREHEADER]]: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[TMP2]], -1 -; CHECK-NEXT: [[TMP1]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[TMP2]], 1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[T2]], 1 ; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 0 ; CHECK-NEXT: 
br i1 [[LCMP_MOD_NOT]], label %[[FOR_BODY3_PROL_LOOPEXIT]], label %[[FOR_BODY3_PROL:.*]] ; CHECK: [[FOR_BODY3_PROL]]: -; CHECK-NEXT: [[DEC18_PROL:%.*]] = add nsw i32 [[TMP2]], -1 -; CHECK-NEXT: [[TMP3_PROL:%.*]] = load i8, ptr [[A_021]], align 1 -; CHECK-NEXT: [[CMP_PROL:%.*]] = icmp eq i8 [[TMP3_PROL]], 0 +; CHECK-NEXT: [[DEC18_PROL:%.*]] = add nsw i32 [[T2]], -1 +; CHECK-NEXT: [[T3_PROL:%.*]] = load i8, ptr [[A_021]], align 1 +; CHECK-NEXT: [[CMP_PROL:%.*]] = icmp eq i8 [[T3_PROL]], 0 ; CHECK-NEXT: br i1 [[CMP_PROL]], label %[[IF_THEN_PROL:.*]], label %[[FOR_INC_PROL:.*]] ; CHECK: [[IF_THEN_PROL]]: ; CHECK-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[R_022]], i64 2 @@ -69,17 +69,17 @@ define void @_Z3fn1v(ptr %r, ptr %a) #0 { ; CHECK-NEXT: br label %[[FOR_BODY3_PROL_LOOPEXIT]] ; CHECK: [[FOR_BODY3_PROL_LOOPEXIT]]: ; CHECK-NEXT: [[ADD_PTR_LCSSA_UNR]] = phi ptr [ poison, %[[FOR_BODY3_PREHEADER]] ], [ [[ADD_PTR_PROL]], %[[FOR_INC_PROL]] ] -; CHECK-NEXT: [[DEC18_IN_UNR:%.*]] = phi i32 [ [[TMP2]], %[[FOR_BODY3_PREHEADER]] ], [ [[DEC18_PROL]], %[[FOR_INC_PROL]] ] +; CHECK-NEXT: [[DEC18_IN_UNR:%.*]] = phi i32 [ [[T2]], %[[FOR_BODY3_PREHEADER]] ], [ [[DEC18_PROL]], %[[FOR_INC_PROL]] ] ; CHECK-NEXT: [[R_117_UNR:%.*]] = phi ptr [ [[R_022]], %[[FOR_BODY3_PREHEADER]] ], [ [[ADD_PTR_PROL]], %[[FOR_INC_PROL]] ] ; CHECK-NEXT: [[A_116_UNR:%.*]] = phi ptr [ [[A_021]], %[[FOR_BODY3_PREHEADER]] ], [ [[INCDEC_PTR_PROL]], %[[FOR_INC_PROL]] ] -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: br i1 [[TMP4]], label %[[FOR_COND_LOOPEXIT_LOOPEXIT]], label %[[FOR_BODY3:.*]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[T2]], 1 +; CHECK-NEXT: br i1 [[TMP2]], label %[[FOR_COND_LOOPEXIT_LOOPEXIT]], label %[[FOR_BODY3:.*]] ; CHECK: [[FOR_BODY3]]: ; CHECK-NEXT: [[DEC18_IN:%.*]] = phi i32 [ [[DEC18_1:%.*]], %[[FOR_INC_1]] ], [ [[DEC18_IN_UNR]], %[[FOR_BODY3_PROL_LOOPEXIT]] ] ; CHECK-NEXT: [[R_117:%.*]] = phi ptr [ [[ADD_PTR_1]], %[[FOR_INC_1]] ], [ 
[[R_117_UNR]], %[[FOR_BODY3_PROL_LOOPEXIT]] ] ; CHECK-NEXT: [[A_116:%.*]] = phi ptr [ [[INCDEC_PTR_1:%.*]], %[[FOR_INC_1]] ], [ [[A_116_UNR]], %[[FOR_BODY3_PROL_LOOPEXIT]] ] -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[A_116]], align 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP3]], 0 +; CHECK-NEXT: [[T3:%.*]] = load i8, ptr [[A_116]], align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[T3]], 0 ; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[FOR_INC:.*]] ; CHECK: [[IF_THEN]]: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[R_117]], i64 2 @@ -91,8 +91,8 @@ define void @_Z3fn1v(ptr %r, ptr %a) #0 { ; CHECK: [[FOR_INC]]: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A_116]], i64 1 ; CHECK-NEXT: [[DEC18_1]] = add nsw i32 [[DEC18_IN]], -2 -; CHECK-NEXT: [[TMP3_1:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1 -; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i8 [[TMP3_1]], 0 +; CHECK-NEXT: [[T3_1:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1 +; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i8 [[T3_1]], 0 ; CHECK-NEXT: br i1 [[CMP_1]], label %[[IF_THEN_1:.*]], label %[[FOR_INC_1]] ; CHECK: [[IF_THEN_1]]: ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[R_117]], i64 6 diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/assert-vplan-cost-model.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/assert-vplan-cost-model.ll new file mode 100644 index 0000000000000..b00888f4b61e0 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/assert-vplan-cost-model.ll @@ -0,0 +1,141 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=loop-vectorize -amdgpu-coerce-illegal-types=1 < %s -S -o - | FileCheck %s + +; REQUIRES: asserts + +target triple = "amdgcn-amd-amdhsa" + +; Function Attrs: mustprogress nofree norecurse nosync nounwind memory(argmem: readwrite) +define protected amdgpu_kernel void @func_int8(ptr 
addrspace(1) %p_a_grid.coerce, ptr addrspace(1) %p_b_grid.coerce, ptr addrspace(1) %p_c_grid.coerce, i32 %m, i32 %n, i32 %k, i1 %c, i32 %add, i32 %add12) { +; CHECK-LABEL: define protected amdgpu_kernel void @func_int8( +; CHECK-SAME: ptr addrspace(1) [[P_A_GRID_COERCE:%.*]], ptr addrspace(1) [[P_B_GRID_COERCE:%.*]], ptr addrspace(1) [[P_C_GRID_COERCE:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], i32 [[K:%.*]], i1 [[C:%.*]], i32 [[ADD:%.*]], i32 [[ADD12:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[C]], label %[[FOR_COND_PREHEADER:.*]], label %[[IF_END:.*]] +; CHECK: [[FOR_COND_PREHEADER]]: +; CHECK-NEXT: [[CMP1444:%.*]] = icmp sgt i32 [[K]], 0 +; CHECK-NEXT: br i1 [[CMP1444]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_BODY_LR_PH]]: +; CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 [[ADD]], [[K]] +; CHECK-NEXT: [[MUL17:%.*]] = mul nsw i32 [[ADD12]], [[K]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[K]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw i32 [[TMP0]], [[MUL15]] +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP0]], [[MUL17]] +; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P_A_GRID_COERCE]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr addrspace(1) 
[[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P_B_GRID_COERCE]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i8>, ptr addrspace(1) [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[WIDE_LOAD]] to <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = zext <2 x i8> [[WIDE_LOAD1]] to <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw <2 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12]] = add <2 x i32> [[TMP11]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP12]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; CHECK-NEXT: [[ADD24_LCSSA:%.*]] = phi i32 [ [[ADD24:%.*]], %[[FOR_BODY]] ], [ [[TMP14]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP15:%.*]] = trunc i32 [[ADD24_LCSSA]] to i8 +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[V_ACC_0_LCSSA:%.*]] = phi i8 [ 0, %[[FOR_COND_PREHEADER]] ], [ [[TMP15]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[MUL25:%.*]] = mul nsw i32 [[ADD]], [[N]] +; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[ADD12]], [[MUL25]] +; CHECK-NEXT: 
[[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64 +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P_C_GRID_COERCE]], i64 [[IDXPROM27]] +; CHECK-NEXT: store i8 [[V_ACC_0_LCSSA]], ptr addrspace(1) [[ARRAYIDX28]], align 1 +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[K_IDX_046:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[V_ACC_045:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD24]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[K_IDX_046]], [[MUL15]] +; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[K_IDX_046]], [[MUL17]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD16]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P_A_GRID_COERCE]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX_VAL:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[ADD18]] to i64 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P_B_GRID_COERCE]], i64 [[IDXPROM19]] +; CHECK-NEXT: [[ARRAYIDX20_VAL:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX20]], align 1 +; CHECK-NEXT: [[CONV_I47:%.*]] = zext i8 [[ARRAYIDX_VAL]] to i32 +; CHECK-NEXT: [[CONV_I4248:%.*]] = zext i8 [[ARRAYIDX20_VAL]] to i32 +; CHECK-NEXT: [[MUL23:%.*]] = mul nuw nsw i32 [[CONV_I4248]], [[CONV_I47]] +; CHECK-NEXT: [[ADD24]] = add i32 [[MUL23]], [[V_ACC_045]] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[K_IDX_046]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[K]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %for.cond.preheader, label %if.end + +for.cond.preheader: ; preds = %entry + %cmp1444 = icmp sgt i32 %k, 0 + br i1 %cmp1444, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = 
%for.cond.preheader + %mul15 = mul nsw i32 %add, %k + %mul17 = mul nsw i32 %add12, %k + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %add24.lcssa = phi i32 [ %add24, %for.body ] + %17 = trunc i32 %add24.lcssa to i8 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %for.cond.preheader + %v_acc.0.lcssa = phi i8 [ 0, %for.cond.preheader ], [ %17, %for.cond.cleanup.loopexit ] + %mul25 = mul nsw i32 %add, %n + %add26 = add nsw i32 %add12, %mul25 + %idxprom27 = sext i32 %add26 to i64 + %arrayidx28 = getelementptr inbounds i8, ptr addrspace(1) %p_c_grid.coerce, i64 %idxprom27 + store i8 %v_acc.0.lcssa, ptr addrspace(1) %arrayidx28, align 1 + br label %if.end + +for.body: ; preds = %for.body, %for.body.lr.ph + %k_idx.046 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %v_acc.045 = phi i32 [ 0, %for.body.lr.ph ], [ %add24, %for.body ] + %add16 = add nsw i32 %k_idx.046, %mul15 + %add18 = add nsw i32 %k_idx.046, %mul17 + %idxprom = sext i32 %add16 to i64 + %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %p_a_grid.coerce, i64 %idxprom + %arrayidx.val = load i8, ptr addrspace(1) %arrayidx, align 1 + %idxprom19 = sext i32 %add18 to i64 + %arrayidx20 = getelementptr inbounds i8, ptr addrspace(1) %p_b_grid.coerce, i64 %idxprom19 + %arrayidx20.val = load i8, ptr addrspace(1) %arrayidx20, align 1 + %conv.i47 = zext i8 %arrayidx.val to i32 + %conv.i4248 = zext i8 %arrayidx20.val to i32 + %mul23 = mul nuw nsw i32 %conv.i4248, %conv.i47 + %add24 = add i32 %mul23, %v_acc.045 + %inc = add nuw nsw i32 %k_idx.046, 1 + %exitcond.not = icmp eq i32 %inc, %k + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body + +if.end: ; preds = %for.cond.cleanup, %entry + ret void +} +;. 
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META1]]} +; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +;. diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll index 778f25f5620f2..a13c36f693bac 100644 --- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll +++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll @@ -13,25 +13,25 @@ define void @arm_mean_q7(ptr noundef %pSrc, i32 noundef %blockSize, ptr noundef ; CHECK-NEXT: br i1 [[CMP_NOT10]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] ; CHECK: while.body.preheader: ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[BLOCKSIZE]], 4 -; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[BLOCKSIZE]], -16 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[SUM_013:%.*]] = phi i32 [ [[TMP3:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[SUM_013:%.*]] = phi i32 [ [[TMP2:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: [[PSRC_ADDR_012:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRC:%.*]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: [[BLKCNT_011:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[SHR]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[PSRC_ADDR_012]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[TMP1]], i32 0) -; CHECK-NEXT: [[TMP3]] = add i32 [[TMP2]], [[SUM_013]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[PSRC_ADDR_012]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[TMP0]], i32 0) +; CHECK-NEXT: [[TMP2]] = add i32 [[TMP1]], [[SUM_013]] ; CHECK-NEXT: [[DEC]] = add nsw i32 [[BLKCNT_011]], -1 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds nuw i8, ptr [[PSRC_ADDR_012]], i32 
16 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] ; CHECK: while.end.loopexit: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[PSRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[BLOCKSIZE]], -16 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[PSRC]], i32 [[TMP3]] ; CHECK-NEXT: br label [[WHILE_END]] ; CHECK: while.end: ; CHECK-NEXT: [[PSRC_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PSRC]], [[ENTRY:%.*]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP3]], [[WHILE_END_LOOPEXIT]] ] +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP2]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[AND:%.*]] = and i32 [[BLOCKSIZE]], 15 ; CHECK-NEXT: [[CMP2_NOT15:%.*]] = icmp eq i32 [[AND]], 0 ; CHECK-NEXT: br i1 [[CMP2_NOT15]], label [[WHILE_END5:%.*]], label [[MIDDLE_BLOCK:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll index 3749bdf1bba39..2cd29c5241186 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll @@ -363,11 +363,96 @@ bb: ret <4 x i16> %ins.3 } +define <4 x i8> @uadd_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) { +; GCN-LABEL: @uadd_sat_v4i8( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <4 x i8> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <4 x i8> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i8> [[ARG0]], i64 2 +; GCN-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i8> [[ARG0]], i64 3 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <4 x i8> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i8> [[ARG1]], i64 1 +; GCN-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i8> [[ARG1]], i64 2 +; GCN-NEXT: 
[[ARG1_3:%.*]] = extractelement <4 x i8> [[ARG1]], i64 3 +; GCN-NEXT: [[ADD_0:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[ARG0_0]], i8 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[ARG0_1]], i8 [[ARG1_1]]) +; GCN-NEXT: [[ADD_2:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[ARG0_2]], i8 [[ARG1_2]]) +; GCN-NEXT: [[ADD_3:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[ARG0_3]], i8 [[ARG1_3]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <4 x i8> poison, i8 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <4 x i8> [[INS_0]], i8 [[ADD_1]], i64 1 +; GCN-NEXT: [[INS_2:%.*]] = insertelement <4 x i8> [[INS_1]], i8 [[ADD_2]], i64 2 +; GCN-NEXT: [[INS_3:%.*]] = insertelement <4 x i8> [[INS_2]], i8 [[ADD_3]], i64 3 +; GCN-NEXT: ret <4 x i8> [[INS_3]] +; +bb: + %arg0.0 = extractelement <4 x i8> %arg0, i64 0 + %arg0.1 = extractelement <4 x i8> %arg0, i64 1 + %arg0.2 = extractelement <4 x i8> %arg0, i64 2 + %arg0.3 = extractelement <4 x i8> %arg0, i64 3 + %arg1.0 = extractelement <4 x i8> %arg1, i64 0 + %arg1.1 = extractelement <4 x i8> %arg1, i64 1 + %arg1.2 = extractelement <4 x i8> %arg1, i64 2 + %arg1.3 = extractelement <4 x i8> %arg1, i64 3 + %add.0 = call i8 @llvm.uadd.sat.i8(i8 %arg0.0, i8 %arg1.0) + %add.1 = call i8 @llvm.uadd.sat.i8(i8 %arg0.1, i8 %arg1.1) + %add.2 = call i8 @llvm.uadd.sat.i8(i8 %arg0.2, i8 %arg1.2) + %add.3 = call i8 @llvm.uadd.sat.i8(i8 %arg0.3, i8 %arg1.3) + %ins.0 = insertelement <4 x i8> poison, i8 %add.0, i64 0 + %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1 + %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2 + %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3 + ret <4 x i8> %ins.3 +} + +define <4 x i8> @usub_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) { +; GCN-LABEL: @usub_sat_v4i8( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <4 x i8> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <4 x i8> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i8> [[ARG0]], i64 2 +; 
GCN-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i8> [[ARG0]], i64 3 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <4 x i8> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i8> [[ARG1]], i64 1 +; GCN-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i8> [[ARG1]], i64 2 +; GCN-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i8> [[ARG1]], i64 3 +; GCN-NEXT: [[ADD_0:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[ARG0_0]], i8 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[ARG0_1]], i8 [[ARG1_1]]) +; GCN-NEXT: [[ADD_2:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[ARG0_2]], i8 [[ARG1_2]]) +; GCN-NEXT: [[ADD_3:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[ARG0_3]], i8 [[ARG1_3]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <4 x i8> poison, i8 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <4 x i8> [[INS_0]], i8 [[ADD_1]], i64 1 +; GCN-NEXT: [[INS_2:%.*]] = insertelement <4 x i8> [[INS_1]], i8 [[ADD_2]], i64 2 +; GCN-NEXT: [[INS_3:%.*]] = insertelement <4 x i8> [[INS_2]], i8 [[ADD_3]], i64 3 +; GCN-NEXT: ret <4 x i8> [[INS_3]] +; +bb: + %arg0.0 = extractelement <4 x i8> %arg0, i64 0 + %arg0.1 = extractelement <4 x i8> %arg0, i64 1 + %arg0.2 = extractelement <4 x i8> %arg0, i64 2 + %arg0.3 = extractelement <4 x i8> %arg0, i64 3 + %arg1.0 = extractelement <4 x i8> %arg1, i64 0 + %arg1.1 = extractelement <4 x i8> %arg1, i64 1 + %arg1.2 = extractelement <4 x i8> %arg1, i64 2 + %arg1.3 = extractelement <4 x i8> %arg1, i64 3 + %add.0 = call i8 @llvm.usub.sat.i8(i8 %arg0.0, i8 %arg1.0) + %add.1 = call i8 @llvm.usub.sat.i8(i8 %arg0.1, i8 %arg1.1) + %add.2 = call i8 @llvm.usub.sat.i8(i8 %arg0.2, i8 %arg1.2) + %add.3 = call i8 @llvm.usub.sat.i8(i8 %arg0.3, i8 %arg1.3) + %ins.0 = insertelement <4 x i8> poison, i8 %add.0, i64 0 + %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1 + %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2 + %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3 + ret <4 x i8> %ins.3 +} + declare i16 @llvm.uadd.sat.i16(i16, 
i16) #0 declare i16 @llvm.usub.sat.i16(i16, i16) #0 declare i16 @llvm.sadd.sat.i16(i16, i16) #0 declare i16 @llvm.ssub.sat.i16(i16, i16) #0 +declare i8 @llvm.uadd.sat.i8(i8, i8) #0 +declare i8 @llvm.usub.sat.i8(i8, i8) #0 + declare i32 @llvm.uadd.sat.i32(i32, i32) #0 declare i32 @llvm.usub.sat.i32(i32, i32) #0 declare i32 @llvm.sadd.sat.i32(i32, i32) #0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll index 0bb641371825b..4653e47f4a5f3 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -363,11 +363,97 @@ bb: ret <4 x i16> %ins.3 } +define <4 x i8> @uadd_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1, ptr addrspace(1) %dst) { +; GCN-LABEL: @uadd_sat_v4i8( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <4 x i8> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <4 x i8> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i8> [[ARG0]], i64 2 +; GCN-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i8> [[ARG0]], i64 3 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <4 x i8> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i8> [[ARG1]], i64 1 +; GCN-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i8> [[ARG1]], i64 2 +; GCN-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i8> [[ARG1]], i64 3 +; GCN-NEXT: [[ADD_0:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[ARG0_0]], i8 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[ARG0_1]], i8 [[ARG1_1]]) +; GCN-NEXT: [[ADD_2:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[ARG0_2]], i8 [[ARG1_2]]) +; GCN-NEXT: [[ADD_3:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[ARG0_3]], i8 [[ARG1_3]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <4 x i8> poison, i8 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <4 x i8> [[INS_0]], i8 [[ADD_1]], i64 1 +; GCN-NEXT: [[INS_2:%.*]] = insertelement <4 x i8> [[INS_1]], i8 [[ADD_2]], i64 2 +; 
GCN-NEXT: [[INS_3:%.*]] = insertelement <4 x i8> [[INS_2]], i8 [[ADD_3]], i64 3 +; GCN-NEXT: ret <4 x i8> [[INS_3]] +; +bb: + %arg0.0 = extractelement <4 x i8> %arg0, i64 0 + %arg0.1 = extractelement <4 x i8> %arg0, i64 1 + %arg0.2 = extractelement <4 x i8> %arg0, i64 2 + %arg0.3 = extractelement <4 x i8> %arg0, i64 3 + %arg1.0 = extractelement <4 x i8> %arg1, i64 0 + %arg1.1 = extractelement <4 x i8> %arg1, i64 1 + %arg1.2 = extractelement <4 x i8> %arg1, i64 2 + %arg1.3 = extractelement <4 x i8> %arg1, i64 3 + %add.0 = call i8 @llvm.uadd.sat.i8(i8 %arg0.0, i8 %arg1.0) + %add.1 = call i8 @llvm.uadd.sat.i8(i8 %arg0.1, i8 %arg1.1) + %add.2 = call i8 @llvm.uadd.sat.i8(i8 %arg0.2, i8 %arg1.2) + %add.3 = call i8 @llvm.uadd.sat.i8(i8 %arg0.3, i8 %arg1.3) + %ins.0 = insertelement <4 x i8> undef, i8 %add.0, i64 0 + %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1 + %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2 + %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3 + ret <4 x i8> %ins.3 +} +define <4 x i8> @usub_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) { +; GCN-LABEL: @usub_sat_v4i8( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <4 x i8> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <4 x i8> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i8> [[ARG0]], i64 2 +; GCN-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i8> [[ARG0]], i64 3 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <4 x i8> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i8> [[ARG1]], i64 1 +; GCN-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i8> [[ARG1]], i64 2 +; GCN-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i8> [[ARG1]], i64 3 +; GCN-NEXT: [[ADD_0:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[ARG0_0]], i8 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[ARG0_1]], i8 [[ARG1_1]]) +; GCN-NEXT: [[ADD_2:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[ARG0_2]], i8 [[ARG1_2]]) +; GCN-NEXT: [[ADD_3:%.*]] = call i8 
@llvm.usub.sat.i8(i8 [[ARG0_3]], i8 [[ARG1_3]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <4 x i8> poison, i8 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <4 x i8> [[INS_0]], i8 [[ADD_1]], i64 1 +; GCN-NEXT: [[INS_2:%.*]] = insertelement <4 x i8> [[INS_1]], i8 [[ADD_2]], i64 2 +; GCN-NEXT: [[INS_3:%.*]] = insertelement <4 x i8> [[INS_2]], i8 [[ADD_3]], i64 3 +; GCN-NEXT: ret <4 x i8> [[INS_3]] +; +bb: + %arg0.0 = extractelement <4 x i8> %arg0, i64 0 + %arg0.1 = extractelement <4 x i8> %arg0, i64 1 + %arg0.2 = extractelement <4 x i8> %arg0, i64 2 + %arg0.3 = extractelement <4 x i8> %arg0, i64 3 + %arg1.0 = extractelement <4 x i8> %arg1, i64 0 + %arg1.1 = extractelement <4 x i8> %arg1, i64 1 + %arg1.2 = extractelement <4 x i8> %arg1, i64 2 + %arg1.3 = extractelement <4 x i8> %arg1, i64 3 + %add.0 = call i8 @llvm.usub.sat.i8(i8 %arg0.0, i8 %arg1.0) + %add.1 = call i8 @llvm.usub.sat.i8(i8 %arg0.1, i8 %arg1.1) + %add.2 = call i8 @llvm.usub.sat.i8(i8 %arg0.2, i8 %arg1.2) + %add.3 = call i8 @llvm.usub.sat.i8(i8 %arg0.3, i8 %arg1.3) + %ins.0 = insertelement <4 x i8> undef, i8 %add.0, i64 0 + %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1 + %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2 + %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3 + ret <4 x i8> %ins.3 + +} + + declare i16 @llvm.uadd.sat.i16(i16, i16) #0 declare i16 @llvm.usub.sat.i16(i16, i16) #0 declare i16 @llvm.sadd.sat.i16(i16, i16) #0 declare i16 @llvm.ssub.sat.i16(i16, i16) #0 +declare i8 @llvm.uadd.sat.i8(i8, i8) #0 +declare i8 @llvm.usub.sat.i8(i8, i8) #0 + declare i32 @llvm.uadd.sat.i32(i32, i32) #0 declare i32 @llvm.usub.sat.i32(i32, i32) #0 declare i32 @llvm.sadd.sat.i32(i32, i32) #0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll new file mode 100644 index 0000000000000..f3e89b60b8045 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll @@ -0,0 
+1,221 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN %s + +; The insertelements in the exit block use the various parts of the vectorized tree. These external uses are just creating an identity vector using a sequence +; of insert elements. Since these insertelements are just recreating the same vectors that were produced during vectorization, they should not increase the cost of vectorization. + +define void @phi_4(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, ptr %out2, i32 %flag) { +; GCN-LABEL: define void @phi_4( +; GCN-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], ptr [[OUT2:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] { +; GCN-NEXT: [[ENTRY:.*]]: +; GCN-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8 +; GCN-NEXT: [[GEP2:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[GEP4:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 4 +; GCN-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8 +; GCN-NEXT: [[GEP6:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 6 +; GCN-NEXT: [[TMP3:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2 +; GCN-NEXT: [[GEP8:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 8 +; GCN-NEXT: [[TMP4:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8 +; GCN-NEXT: [[GEP10:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 10 +; GCN-NEXT: [[TMP5:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2 +; GCN-NEXT: [[GEP12:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 12 +; GCN-NEXT: [[TMP6:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8 +; GCN-NEXT: 
[[GEP14:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 14 +; GCN-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2 +; GCN-NEXT: br label %[[DO_BODY:.*]] +; GCN: [[DO_BODY]]: +; GCN-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP16:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP17:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], %[[ENTRY]] ], [ [[TMP18:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP19:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[TMP4]], %[[ENTRY]] ], [ [[TMP20:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP21:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP6]], %[[ENTRY]] ], [ [[TMP22:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP42:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP23:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP16]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8 +; GCN-NEXT: [[TMP17]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[TMP18]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8 +; GCN-NEXT: [[TMP19]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2 +; GCN-NEXT: [[TMP20]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8 +; GCN-NEXT: [[TMP21]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2 +; GCN-NEXT: [[TMP22]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8 +; GCN-NEXT: [[TMP23]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2 +; GCN-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0 +; GCN-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] +; GCN: [[EXIT]]: +; GCN-NEXT: [[TMP24:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> [[TMP17]], <16 x i32> +; GCN-NEXT: [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP18]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP26:%.*]] = shufflevector 
<16 x i16> [[TMP24]], <16 x i16> [[TMP25]], <16 x i32> +; GCN-NEXT: [[TMP27:%.*]] = shufflevector <2 x i16> [[TMP19]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP28:%.*]] = shufflevector <16 x i16> [[TMP26]], <16 x i16> [[TMP27]], <16 x i32> +; GCN-NEXT: [[TMP29:%.*]] = shufflevector <2 x i16> [[TMP20]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP28]], <16 x i16> [[TMP29]], <16 x i32> +; GCN-NEXT: [[TMP31:%.*]] = shufflevector <2 x i16> [[TMP21]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP32:%.*]] = shufflevector <16 x i16> [[TMP30]], <16 x i16> [[TMP31]], <16 x i32> +; GCN-NEXT: [[TMP33:%.*]] = shufflevector <2 x i16> [[TMP22]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP47:%.*]] = shufflevector <16 x i16> [[TMP32]], <16 x i16> [[TMP33]], <16 x i32> +; GCN-NEXT: [[TMP48:%.*]] = shufflevector <2 x i16> [[TMP23]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP49:%.*]] = shufflevector <16 x i16> [[TMP47]], <16 x i16> [[TMP48]], <16 x i32> +; GCN-NEXT: [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <16 x i32> +; GCN-NEXT: [[TMP38:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP37]], <16 x i16> [[TMP38]], <16 x i32> +; GCN-NEXT: [[TMP40:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP41:%.*]] = shufflevector <16 x i16> [[TMP39]], <16 x i16> [[TMP40]], <16 x i32> +; GCN-NEXT: [[TMP57:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP43:%.*]] = shufflevector <16 x i16> [[TMP41]], <16 x i16> [[TMP57]], <16 x i32> +; GCN-NEXT: [[TMP44:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[TMP43]], <16 x i16> [[TMP44]], <16 x i32> +; GCN-NEXT: [[TMP46:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP58:%.*]] = shufflevector 
<16 x i16> [[TMP45]], <16 x i16> [[TMP46]], <16 x i32> +; GCN-NEXT: [[TMP60:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC2157:%.*]] = shufflevector <16 x i16> [[TMP58]], <16 x i16> [[TMP60]], <16 x i32> +; GCN-NEXT: [[TMP50:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP51:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC231:%.*]] = shufflevector <16 x i16> [[TMP50]], <16 x i16> [[TMP51]], <16 x i32> +; GCN-NEXT: [[TMP52:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC252:%.*]] = shufflevector <16 x i16> [[VEC231]], <16 x i16> [[TMP52]], <16 x i32> +; GCN-NEXT: [[TMP53:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC273:%.*]] = shufflevector <16 x i16> [[VEC252]], <16 x i16> [[TMP53]], <16 x i32> +; GCN-NEXT: [[TMP54:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC294:%.*]] = shufflevector <16 x i16> [[VEC273]], <16 x i16> [[TMP54]], <16 x i32> +; GCN-NEXT: [[TMP55:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC2115:%.*]] = shufflevector <16 x i16> [[VEC294]], <16 x i16> [[TMP55]], <16 x i32> +; GCN-NEXT: [[TMP56:%.*]] = shufflevector <2 x i16> [[TMP14]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC2136:%.*]] = shufflevector <16 x i16> [[VEC2115]], <16 x i16> [[TMP56]], <16 x i32> +; GCN-NEXT: [[TMP59:%.*]] = shufflevector <2 x i16> [[TMP42]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[VEC2151:%.*]] = shufflevector <16 x i16> [[VEC2136]], <16 x i16> [[TMP59]], <16 x i32> +; GCN-NEXT: store <16 x i16> [[VEC2157]], ptr [[OUT]], align 32 +; GCN-NEXT: store <16 x i16> [[TMP49]], ptr [[OUT1]], align 32 +; GCN-NEXT: store <16 x i16> [[VEC2151]], ptr [[OUT2]], align 32 +; GCN-NEXT: ret void +; +entry: + %ele0 = load i16, ptr addrspace(3) %inptr0, align 8 + %gep1 = getelementptr 
i16, ptr addrspace(3) %inptr0, i32 1 + %ele1 = load i16, ptr addrspace(3) %gep1, align 1 + %gep2 = getelementptr i16, ptr addrspace(3) %inptr0, i32 2 + %ele2 = load i16, ptr addrspace(3) %gep2, align 2 + %gep3 = getelementptr i16, ptr addrspace(3) %inptr0, i32 3 + %ele3 = load i16, ptr addrspace(3) %gep3, align 1 + %gep4 = getelementptr i16, ptr addrspace(3) %inptr0, i32 4 + %ele4 = load i16, ptr addrspace(3) %gep4, align 8 + %gep5 = getelementptr i16, ptr addrspace(3) %inptr0, i32 5 + %ele5 = load i16, ptr addrspace(3) %gep5, align 1 + %gep6 = getelementptr i16, ptr addrspace(3) %inptr0, i32 6 + %ele6 = load i16, ptr addrspace(3) %gep6, align 2 + %gep7 = getelementptr i16, ptr addrspace(3) %inptr0, i32 7 + %ele7 = load i16, ptr addrspace(3) %gep7, align 1 + %gep8 = getelementptr i16, ptr addrspace(3) %inptr0, i32 8 + %ele8 = load i16, ptr addrspace(3) %gep8, align 8 + %gep9 = getelementptr i16, ptr addrspace(3) %inptr0, i32 9 + %ele9 = load i16, ptr addrspace(3) %gep9, align 1 + %gep10 = getelementptr i16, ptr addrspace(3) %inptr0, i32 10 + %ele10 = load i16, ptr addrspace(3) %gep10, align 2 + %gep11 = getelementptr i16, ptr addrspace(3) %inptr0, i32 11 + %ele11 = load i16, ptr addrspace(3) %gep11, align 1 + %gep12 = getelementptr i16, ptr addrspace(3) %inptr0, i32 12 + %ele12 = load i16, ptr addrspace(3) %gep12, align 8 + %gep13 = getelementptr i16, ptr addrspace(3) %inptr0, i32 13 + %ele13 = load i16, ptr addrspace(3) %gep13, align 1 + %gep14 = getelementptr i16, ptr addrspace(3) %inptr0, i32 14 + %ele14 = load i16, ptr addrspace(3) %gep14, align 2 + %gep15 = getelementptr i16, ptr addrspace(3) %inptr0, i32 15 + %ele15 = load i16, ptr addrspace(3) %gep15, align 1 + br label %do.body + +do.body: + %phi0 = phi i16 [ %ele0, %entry ], [ %otherele0, %do.body ] + %phi1 = phi i16 [ %ele1, %entry ], [ %otherele1, %do.body ] + %phi2 = phi i16 [ %ele2, %entry ], [ %otherele2, %do.body ] + %phi3 = phi i16 [ %ele3, %entry ], [ %otherele3, %do.body ] + %phi4 = phi i16 [ 
%ele4, %entry ], [ %otherele4, %do.body ] + %phi5 = phi i16 [ %ele5, %entry ], [ %otherele5, %do.body ] + %phi6 = phi i16 [ %ele6, %entry ], [ %otherele6, %do.body ] + %phi7 = phi i16 [ %ele7, %entry ], [ %otherele7, %do.body ] + %phi8 = phi i16 [ %ele8, %entry ], [ %otherele8, %do.body ] + %phi9 = phi i16 [ %ele9, %entry ], [ %otherele9, %do.body ] + %phi10 = phi i16 [ %ele10, %entry ], [ %otherele10, %do.body ] + %phi11 = phi i16 [ %ele11, %entry ], [ %otherele11, %do.body ] + %phi12 = phi i16 [ %ele12, %entry ], [ %otherele12, %do.body ] + %phi13 = phi i16 [ %ele13, %entry ], [ %otherele13, %do.body ] + %phi14 = phi i16 [ %ele14, %entry ], [ %otherele14, %do.body ] + %phi15 = phi i16 [ %ele15, %entry ], [ %otherele15, %do.body ] + + %otherele0 = load i16, ptr addrspace(3) %inptr0, align 8 + %otherele1 = load i16, ptr addrspace(3) %gep1, align 1 + %otherele2 = load i16, ptr addrspace(3) %gep2, align 2 + %otherele3 = load i16, ptr addrspace(3) %gep3, align 1 + %otherele4 = load i16, ptr addrspace(3) %gep4, align 8 + %otherele5 = load i16, ptr addrspace(3) %gep5, align 1 + %otherele6 = load i16, ptr addrspace(3) %gep6, align 2 + %otherele7 = load i16, ptr addrspace(3) %gep7, align 1 + %otherele8 = load i16, ptr addrspace(3) %gep8, align 8 + %otherele9 = load i16, ptr addrspace(3) %gep9, align 1 + %otherele10 = load i16, ptr addrspace(3) %gep10, align 2 + %otherele11 = load i16, ptr addrspace(3) %gep11, align 1 + %otherele12 = load i16, ptr addrspace(3) %gep12, align 8 + %otherele13 = load i16, ptr addrspace(3) %gep13, align 1 + %otherele14 = load i16, ptr addrspace(3) %gep14, align 2 + %otherele15 = load i16, ptr addrspace(3) %gep15, align 1 + %cmp = icmp eq i32 %flag, 0 + br i1 %cmp, label %exit, label %do.body + +exit: + %vec00 = insertelement <16 x i16> poison, i16 %otherele0, i64 0 + %vec01 = insertelement <16 x i16> %vec00, i16 %otherele1, i64 1 + %vec02 = insertelement <16 x i16> %vec01, i16 %otherele2, i64 2 + %vec03 = insertelement <16 x i16> %vec02, i16 
%otherele3, i64 3 + %vec04 = insertelement <16 x i16> %vec03, i16 %otherele4, i64 4 + %vec05 = insertelement <16 x i16> %vec04, i16 %otherele5, i64 5 + %vec06 = insertelement <16 x i16> %vec05, i16 %otherele6, i64 6 + %vec07 = insertelement <16 x i16> %vec06, i16 %otherele7, i64 7 + %vec08 = insertelement <16 x i16> %vec07, i16 %otherele8, i64 8 + %vec09 = insertelement <16 x i16> %vec08, i16 %otherele9, i64 9 + %vec010 = insertelement <16 x i16> %vec09, i16 %otherele10, i64 10 + %vec011 = insertelement <16 x i16> %vec010, i16 %otherele11, i64 11 + %vec012 = insertelement <16 x i16> %vec011, i16 %otherele12, i64 12 + %vec013 = insertelement <16 x i16> %vec012, i16 %otherele13, i64 13 + %vec014 = insertelement <16 x i16> %vec013, i16 %otherele14, i64 14 + %vec015 = insertelement <16 x i16> %vec014, i16 %otherele15, i64 15 + + %vec10 = insertelement <16 x i16> poison, i16 %ele0, i64 0 + %vec11 = insertelement <16 x i16> %vec10, i16 %ele1, i64 1 + %vec12 = insertelement <16 x i16> %vec11, i16 %ele2, i64 2 + %vec13 = insertelement <16 x i16> %vec12, i16 %ele3, i64 3 + %vec14 = insertelement <16 x i16> %vec13, i16 %ele4, i64 4 + %vec15 = insertelement <16 x i16> %vec14, i16 %ele5, i64 5 + %vec16 = insertelement <16 x i16> %vec15, i16 %ele6, i64 6 + %vec17 = insertelement <16 x i16> %vec16, i16 %ele7, i64 7 + %vec18 = insertelement <16 x i16> %vec17, i16 %ele8, i64 8 + %vec19 = insertelement <16 x i16> %vec18, i16 %ele9, i64 9 + %vec110 = insertelement <16 x i16> %vec19, i16 %ele10, i64 10 + %vec111 = insertelement <16 x i16> %vec110, i16 %ele11, i64 11 + %vec112 = insertelement <16 x i16> %vec111, i16 %ele12, i64 12 + %vec113 = insertelement <16 x i16> %vec112, i16 %ele13, i64 13 + %vec114 = insertelement <16 x i16> %vec113, i16 %ele14, i64 14 + %vec115 = insertelement <16 x i16> %vec114, i16 %ele15, i64 15 + + %vec20 = insertelement <16 x i16> poison, i16 %phi0, i64 0 + %vec21 = insertelement <16 x i16> %vec20, i16 %phi1, i64 1 + %vec22 = insertelement <16 x i16> 
%vec21, i16 %phi2, i64 2 + %vec23 = insertelement <16 x i16> %vec22, i16 %phi3, i64 3 + %vec24 = insertelement <16 x i16> %vec23, i16 %phi4, i64 4 + %vec25 = insertelement <16 x i16> %vec24, i16 %phi5, i64 5 + %vec26 = insertelement <16 x i16> %vec25, i16 %phi6, i64 6 + %vec27 = insertelement <16 x i16> %vec26, i16 %phi7, i64 7 + %vec28 = insertelement <16 x i16> %vec27, i16 %phi8, i64 8 + %vec29 = insertelement <16 x i16> %vec28, i16 %phi9, i64 9 + %vec210 = insertelement <16 x i16> %vec29, i16 %phi10, i64 10 + %vec211 = insertelement <16 x i16> %vec210, i16 %phi11, i64 11 + %vec212 = insertelement <16 x i16> %vec211, i16 %phi12, i64 12 + %vec213 = insertelement <16 x i16> %vec212, i16 %phi13, i64 13 + %vec214 = insertelement <16 x i16> %vec213, i16 %phi14, i64 14 + %vec215 = insertelement <16 x i16> %vec214, i16 %phi15, i64 15 + + store <16 x i16> %vec115, ptr %out + store <16 x i16> %vec015, ptr %out1 + store <16 x i16> %vec215, ptr %out2 + + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll new file mode 100644 index 0000000000000..c585a7f08ad0c --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=GFX9 + +define protected amdgpu_kernel void @myfun(i32 %in, ptr addrspace(1) %aptr1, ptr addrspace(1) %bptr1, ptr addrspace(1) %aptr2, ptr addrspace(1) %bptr2) { +; GFX9-LABEL: define protected amdgpu_kernel void @myfun( +; GFX9-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[APTR1:%.*]], ptr addrspace(1) [[BPTR1:%.*]], ptr addrspace(1) [[APTR2:%.*]], ptr addrspace(1) [[BPTR2:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX9-NEXT: [[ENTRY:.*]]: +; GFX9-NEXT: [[VEC1:%.*]] = load <8 x i16>, ptr addrspace(1) [[APTR1]], align 16 +; GFX9-NEXT: 
[[BVEC1:%.*]] = load <8 x i16>, ptr addrspace(1) [[BPTR1]], align 16 +; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: br label %[[DO_BODY:.*]] +; GFX9: [[DO_BODY]]: +; GFX9-NEXT: [[ADD:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEWADD:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP30:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP31:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], %[[ENTRY]] ], [ [[TMP32:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP33:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[TMP4]], %[[ENTRY]] ], [ [[TMP34:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP35:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP6]], %[[ENTRY]] ], [ [[TMP36:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP15:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP37:%.*]], %[[DO_BODY]] ] +; GFX9-NEXT: [[TMP16:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <8 x i32> +; GFX9-NEXT: [[TMP17:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <8 x i32> +; GFX9-NEXT: [[TMP18:%.*]] = shufflevector <2 x 
i16> [[TMP8]], <2 x i16> [[TMP9]], <8 x i32> +; GFX9-NEXT: [[TMP19:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <8 x i32> +; GFX9-NEXT: [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP18]], <8 x i16> [[TMP19]], <8 x i32> +; GFX9-NEXT: [[TMP21:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> poison, <8 x i32> +; GFX9-NEXT: [[TMP22:%.*]] = shufflevector <8 x i16> [[TMP20]], <8 x i16> [[TMP21]], <8 x i32> +; GFX9-NEXT: [[TMP23:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> poison, <8 x i32> +; GFX9-NEXT: [[TMP24:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> poison, <8 x i32> +; GFX9-NEXT: [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> [[TMP13]], <8 x i32> +; GFX9-NEXT: [[TMP26:%.*]] = shufflevector <2 x i16> [[TMP14]], <2 x i16> poison, <8 x i32> +; GFX9-NEXT: [[TMP27:%.*]] = shufflevector <8 x i16> [[TMP25]], <8 x i16> [[TMP26]], <8 x i32> +; GFX9-NEXT: [[TMP28:%.*]] = shufflevector <2 x i16> [[TMP15]], <2 x i16> poison, <8 x i32> +; GFX9-NEXT: [[TMP29:%.*]] = shufflevector <8 x i16> [[TMP27]], <8 x i16> [[TMP28]], <8 x i32> +; GFX9-NEXT: [[RES:%.*]] = add <8 x i16> [[TMP22]], [[TMP29]] +; GFX9-NEXT: [[VEC2:%.*]] = load <8 x i16>, ptr addrspace(1) [[APTR2]], align 16 +; GFX9-NEXT: [[BVEC2:%.*]] = load <8 x i16>, ptr addrspace(1) [[BPTR2]], align 16 +; GFX9-NEXT: [[NEWADD]] = add i32 [[ADD]], 1 +; GFX9-NEXT: [[COND:%.*]] = icmp sgt i32 [[NEWADD]], [[IN]] +; GFX9-NEXT: [[TMP30]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP31]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP32]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP33]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP34]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP35]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP36]] = shufflevector <8 x i16> 
[[BVEC2]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: [[TMP37]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> +; GFX9-NEXT: br i1 [[COND]], label %[[DO_BODY]], label %[[END:.*]] +; GFX9: [[END]]: +; GFX9-NEXT: ret void +; +entry: + %vec1 = load <8 x i16>, ptr addrspace(1) %aptr1 + %el0 = extractelement <8 x i16> %vec1, i64 0 + %el1 = extractelement <8 x i16> %vec1, i64 1 + %el2 = extractelement <8 x i16> %vec1, i64 2 + %el3 = extractelement <8 x i16> %vec1, i64 3 + %el4 = extractelement <8 x i16> %vec1, i64 4 + %el5 = extractelement <8 x i16> %vec1, i64 5 + %el6 = extractelement <8 x i16> %vec1, i64 6 + %el7 = extractelement <8 x i16> %vec1, i64 7 + %bvec1 = load <8 x i16>, ptr addrspace(1) %bptr1 + %bel0 = extractelement <8 x i16> %bvec1, i64 0 + %bel1 = extractelement <8 x i16> %bvec1, i64 1 + %bel2 = extractelement <8 x i16> %bvec1, i64 2 + %bel3 = extractelement <8 x i16> %bvec1, i64 3 + %bel4 = extractelement <8 x i16> %bvec1, i64 4 + %bel5 = extractelement <8 x i16> %bvec1, i64 5 + %bel6 = extractelement <8 x i16> %bvec1, i64 6 + %bel7 = extractelement <8 x i16> %bvec1, i64 7 + br label %do.body + +do.body: + %a_thread_buf1 = phi i16 [%el1, %entry], [%newel1, %do.body] + %a_thread_buf2 = phi i16 [%el2, %entry], [%newel2, %do.body] + %a_thread_buf3 = phi i16 [%el3, %entry], [%newel3, %do.body] + %a_thread_buf4 = phi i16 [%el4, %entry], [%newel4, %do.body] + %a_thread_buf5 = phi i16 [%el5, %entry], [%newel5, %do.body] + %a_thread_buf6 = phi i16 [%el6, %entry], [%newel6, %do.body] + %a_thread_buf7 = phi i16 [%el7, %entry], [%newel7, %do.body] + %b_thread_buf1 = phi i16 [%bel1, %entry], [%bnewel1, %do.body] + %b_thread_buf2 = phi i16 [%bel2, %entry], [%bnewel2, %do.body] + %b_thread_buf3 = phi i16 [%bel3, %entry], [%bnewel3, %do.body] + %b_thread_buf4 = phi i16 [%bel4, %entry], [%bnewel4, %do.body] + %b_thread_buf5 = phi i16 [%bel5, %entry], [%bnewel5, %do.body] + %b_thread_buf6 = phi i16 [%bel6, %entry], [%bnewel6, %do.body] + %b_thread_buf7 = 
phi i16 [%bel7, %entry], [%bnewel7, %do.body] + %add = phi i32 [0, %entry], [%newadd, %do.body] + %a_thread_buf0 = phi i16 [%el0, %entry], [%newel0, %do.body] + %b_thread_buf0 = phi i16 [%bel0, %entry], [%bnewel0, %do.body] + %a_thread_vec0 = insertelement <8 x i16> poison, i16 %a_thread_buf0, i64 0 + %a_thread_vec1 = insertelement <8 x i16> %a_thread_vec0, i16 %a_thread_buf1, i64 1 + %a_thread_vec2 = insertelement <8 x i16> %a_thread_vec1, i16 %a_thread_buf2, i64 2 + %a_thread_vec3 = insertelement <8 x i16> %a_thread_vec2, i16 %a_thread_buf3, i64 3 + %a_thread_vec4 = insertelement <8 x i16> %a_thread_vec3, i16 %a_thread_buf4, i64 4 + %a_thread_vec5 = insertelement <8 x i16> %a_thread_vec4, i16 %a_thread_buf5, i64 5 + %a_thread_vec6 = insertelement <8 x i16> %a_thread_vec5, i16 %a_thread_buf6, i64 6 + %a_thread_vec7 = insertelement <8 x i16> %a_thread_vec6, i16 %a_thread_buf7, i64 7 + %b_thread_vec0 = insertelement <8 x i16> poison, i16 %b_thread_buf0, i64 0 + %b_thread_vec1 = insertelement <8 x i16> %b_thread_vec0, i16 %b_thread_buf1, i64 1 + %b_thread_vec2 = insertelement <8 x i16> %b_thread_vec1, i16 %b_thread_buf2, i64 2 + %b_thread_vec3 = insertelement <8 x i16> %b_thread_vec2, i16 %b_thread_buf3, i64 3 + %b_thread_vec4 = insertelement <8 x i16> %b_thread_vec3, i16 %b_thread_buf4, i64 4 + %b_thread_vec5 = insertelement <8 x i16> %b_thread_vec4, i16 %b_thread_buf5, i64 5 + %b_thread_vec6 = insertelement <8 x i16> %b_thread_vec5, i16 %b_thread_buf6, i64 6 + %b_thread_vec7 = insertelement <8 x i16> %b_thread_vec6, i16 %b_thread_buf7, i64 7 + %res = add <8 x i16> %a_thread_vec7, %b_thread_vec7 + %vec2 = load <8 x i16>, ptr addrspace(1) %aptr2 + %newel0 = extractelement <8 x i16> %vec2, i64 0 + %newel1 = extractelement <8 x i16> %vec2, i64 1 + %newel2 = extractelement <8 x i16> %vec2, i64 2 + %newel3 = extractelement <8 x i16> %vec2, i64 3 + %newel4 = extractelement <8 x i16> %vec2, i64 4 + %newel5 = extractelement <8 x i16> %vec2, i64 5 + %newel6 = extractelement 
<8 x i16> %vec2, i64 6 + %newel7 = extractelement <8 x i16> %vec2, i64 7 + %bvec2 = load <8 x i16>, ptr addrspace(1) %bptr2 + %bnewel0 = extractelement <8 x i16> %bvec2, i64 0 + %bnewel1 = extractelement <8 x i16> %bvec2, i64 1 + %bnewel2 = extractelement <8 x i16> %bvec2, i64 2 + %bnewel3 = extractelement <8 x i16> %bvec2, i64 3 + %bnewel4 = extractelement <8 x i16> %bvec2, i64 4 + %bnewel5 = extractelement <8 x i16> %bvec2, i64 5 + %bnewel6 = extractelement <8 x i16> %bvec2, i64 6 + %bnewel7 = extractelement <8 x i16> %bvec2, i64 7 + %newadd = add i32 %add, 1 + %cond = icmp sgt i32 %newadd, %in + br i1 %cond, label %do.body, label %end + +end: + ret void +} + + diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll new file mode 100644 index 0000000000000..d68e273b56ca7 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll @@ -0,0 +1,569 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX8PLUS,GFX8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX8PLUS,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer -amdgpu-coerce-illegal-types=1 %s | FileCheck -check-prefixes=VECI8 %s + +define protected amdgpu_kernel void @phi(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, i32 %flag) { +; GCN-LABEL: @vectorizePHI( +; GCN-NEXT: entry: +; GCN-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; GCN-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GCN-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 +; GCN-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], 
align 1 +; GCN-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; GCN-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; GCN-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GCN-NEXT: br label [[DO_BODY:%.*]] +; GCN: do.body: +; GCN-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[OTHERELE3:%.*]], [[DO_BODY]] ] +; GCN-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[OTHERELE2:%.*]], [[DO_BODY]] ] +; GCN-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[OTHERELE1:%.*]], [[DO_BODY]] ] +; GCN-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[OTHERELE0:%.*]], [[DO_BODY]] ] +; GCN-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GCN-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GCN-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GCN-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8 +; GCN-NEXT: [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9 +; GCN-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10 +; GCN-NEXT: [[VEC03:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11 +; GCN-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 +; GCN-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GCN-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 +; GCN-NEXT: [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GCN-NEXT: store <16 x i8> [[VEC13]], ptr addrspace(3) [[INPTR1:%.*]], align 2 +; GCN-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; GCN-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[DO_BODY]] +; GCN: exit: +; GCN-NEXT: store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], 
align 16 +; GCN-NEXT: store <16 x i8> [[VEC03]], ptr [[OUT1:%.*]], align 16 +; GCN-NEXT: ret void +; +; GFX7-LABEL: @phi( +; GFX7-NEXT: entry: +; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; GFX7-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 +; GFX7-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX7-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; GFX7-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GFX7-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; GFX7-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX7-NEXT: br label [[DO_BODY:%.*]] +; GFX7: do.body: +; GFX7-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[OTHERELE3:%.*]], [[DO_BODY]] ] +; GFX7-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[OTHERELE2:%.*]], [[DO_BODY]] ] +; GFX7-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[OTHERELE1:%.*]], [[DO_BODY]] ] +; GFX7-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[OTHERELE0:%.*]], [[DO_BODY]] ] +; GFX7-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX7-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX7-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GFX7-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX7-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8 +; GFX7-NEXT: [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9 +; GFX7-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10 +; GFX7-NEXT: [[VEC03:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11 +; GFX7-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 +; GFX7-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> 
[[VEC10]], i8 [[PHI2]], i64 9 +; GFX7-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 +; GFX7-NEXT: [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GFX7-NEXT: store <16 x i8> [[VEC13]], ptr addrspace(3) [[INPTR1:%.*]], align 2 +; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; GFX7-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[DO_BODY]] +; GFX7: exit: +; GFX7-NEXT: store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 16 +; GFX7-NEXT: store <16 x i8> [[VEC03]], ptr [[OUT1:%.*]], align 16 +; GFX7-NEXT: ret void +; +; GFX8PLUS-LABEL: @phi( +; GFX8PLUS-NEXT: entry: +; GFX8PLUS-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; GFX8PLUS-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX8PLUS-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 +; GFX8PLUS-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX8PLUS-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; GFX8PLUS-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GFX8PLUS-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; GFX8PLUS-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX8PLUS-NEXT: br label [[DO_BODY:%.*]] +; GFX8PLUS: do.body: +; GFX8PLUS-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[OTHERELE3:%.*]], [[DO_BODY]] ] +; GFX8PLUS-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[OTHERELE2:%.*]], [[DO_BODY]] ] +; GFX8PLUS-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[OTHERELE1:%.*]], [[DO_BODY]] ] +; GFX8PLUS-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[OTHERELE0:%.*]], [[DO_BODY]] ] +; GFX8PLUS-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX8PLUS-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX8PLUS-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; 
GFX8PLUS-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX8PLUS-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8 +; GFX8PLUS-NEXT: [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9 +; GFX8PLUS-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10 +; GFX8PLUS-NEXT: [[VEC03:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11 +; GFX8PLUS-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 +; GFX8PLUS-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GFX8PLUS-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 +; GFX8PLUS-NEXT: [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GFX8PLUS-NEXT: store <16 x i8> [[VEC13]], ptr addrspace(3) [[INPTR1:%.*]], align 2 +; GFX8PLUS-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; GFX8PLUS-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[DO_BODY]] +; GFX8PLUS: exit: +; GFX8PLUS-NEXT: store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 16 +; GFX8PLUS-NEXT: store <16 x i8> [[VEC03]], ptr [[OUT1:%.*]], align 16 +; GFX8PLUS-NEXT: ret void +; +; VECI8-LABEL: @phi( +; VECI8-NEXT: entry: +; VECI8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; VECI8-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; VECI8-NEXT: br label [[DO_BODY:%.*]] +; VECI8: do.body: +; VECI8-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2:%.*]], [[DO_BODY]] ] +; VECI8-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; VECI8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: store <16 x i8> [[TMP4]], ptr addrspace(3) [[INPTR1:%.*]], align 2 +; VECI8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; VECI8-NEXT: br i1 
[[CMP]], label [[EXIT:%.*]], label [[DO_BODY]] +; VECI8: exit: +; VECI8-NEXT: store <16 x i8> [[TMP4]], ptr [[OUT:%.*]], align 16 +; VECI8-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT1:%.*]], align 16 +; VECI8-NEXT: ret void +; +entry: + %gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0 + %ele0 = load i8, ptr addrspace(3) %gep0, align 8 + %gep1 = getelementptr i8, ptr addrspace(3) %inptr0, i32 1 + %ele1 = load i8, ptr addrspace(3) %gep1, align 1 + %gep2 = getelementptr i8, ptr addrspace(3) %inptr0, i32 2 + %ele2 = load i8, ptr addrspace(3) %gep2, align 2 + %gep3 = getelementptr i8, ptr addrspace(3) %inptr0, i32 3 + %ele3 = load i8, ptr addrspace(3) %gep3, align 1 + br label %do.body + +do.body: + %phi0 = phi i8 [ %ele3, %entry ], [ %otherele3, %do.body ] + %phi1 = phi i8 [ %ele2, %entry ], [ %otherele2, %do.body ] + %phi2 = phi i8 [ %ele1, %entry ], [ %otherele1, %do.body ] + %phi3 = phi i8 [ %ele0, %entry ], [ %otherele0, %do.body ] + %otherele0 = load i8, ptr addrspace(3) %gep0, align 8 + %otherele1 = load i8, ptr addrspace(3) %gep1, align 1 + %otherele2 = load i8, ptr addrspace(3) %gep2, align 2 + %otherele3 = load i8, ptr addrspace(3) %gep3, align 1 + %vec00 = insertelement <16 x i8> poison, i8 %otherele0, i64 8 + %vec01 = insertelement <16 x i8> %vec00, i8 %otherele1, i64 9 + %vec02 = insertelement <16 x i8> %vec01, i8 %otherele2, i64 10 + %vec03 = insertelement <16 x i8> %vec02, i8 %otherele3, i64 11 + %vec10 = insertelement <16 x i8> poison, i8 %phi3, i64 8 + %vec11 = insertelement <16 x i8> %vec10, i8 %phi2, i64 9 + %vec12 = insertelement <16 x i8> %vec11, i8 %phi1, i64 10 + %vec13 = insertelement <16 x i8> %vec12, i8 %phi0, i64 11 + store <16 x i8> %vec13, ptr addrspace(3) %inptr1, align 2 + %cmp = icmp eq i32 %flag, 0 + br i1 %cmp, label %exit, label %do.body + +exit: + store <16 x i8> %vec13, ptr %out + store <16 x i8> %vec03, ptr %out1 + ret void +} + + +define protected amdgpu_kernel void @arith_phi(ptr addrspace(3) %inptr0, ptr %out, i32 %flag) { 
+; GCN-LABEL: @vectorizePHI2( +; GCN-NEXT: entry: +; GCN-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; GCN-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GCN-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 +; GCN-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GCN-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; GCN-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; GCN-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GCN-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; GCN-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[BB_1:%.*]] +; GCN: bb.1: +; GCN-NEXT: [[ADD0:%.*]] = add i8 [[ELE0]], 1 +; GCN-NEXT: [[ADD1:%.*]] = add i8 [[ELE1]], 1 +; GCN-NEXT: [[ADD2:%.*]] = add i8 [[ELE2]], 1 +; GCN-NEXT: [[ADD3:%.*]] = add i8 [[ELE3]], 1 +; GCN-NEXT: br label [[EXIT]] +; GCN: exit: +; GCN-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[ADD0]], [[BB_1]] ] +; GCN-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[ADD1]], [[BB_1]] ] +; GCN-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[ADD2]], [[BB_1]] ] +; GCN-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[ADD3]], [[BB_1]] ] +; GCN-NEXT: [[OTHERELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GCN-NEXT: [[OTHERELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GCN-NEXT: [[OTHERELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[OTHERELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GCN-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 +; GCN-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GCN-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 +; GCN-NEXT: [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 
[[PHI0]], i64 11 +; GCN-NEXT: store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; +; GFX7-LABEL: @arith_phi( +; GFX7-NEXT: entry: +; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; GFX7-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 +; GFX7-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX7-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; GFX7-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GFX7-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; GFX7-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; GFX7-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[BB_1:%.*]] +; GFX7: bb.1: +; GFX7-NEXT: [[ADD0:%.*]] = add i8 [[ELE0]], 1 +; GFX7-NEXT: [[ADD1:%.*]] = add i8 [[ELE1]], 1 +; GFX7-NEXT: [[ADD2:%.*]] = add i8 [[ELE2]], 1 +; GFX7-NEXT: [[ADD3:%.*]] = add i8 [[ELE3]], 1 +; GFX7-NEXT: br label [[EXIT]] +; GFX7: exit: +; GFX7-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[ADD0]], [[BB_1]] ] +; GFX7-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[ADD1]], [[BB_1]] ] +; GFX7-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[ADD2]], [[BB_1]] ] +; GFX7-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[ADD3]], [[BB_1]] ] +; GFX7-NEXT: [[OTHERELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX7-NEXT: [[OTHERELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX7-NEXT: [[OTHERELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GFX7-NEXT: [[OTHERELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX7-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 +; GFX7-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GFX7-NEXT: 
[[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 +; GFX7-NEXT: [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GFX7-NEXT: store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 2 +; GFX7-NEXT: ret void +; +; GFX8PLUS-LABEL: @arith_phi( +; GFX8PLUS-NEXT: entry: +; GFX8PLUS-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; GFX8PLUS-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX8PLUS-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 +; GFX8PLUS-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX8PLUS-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; GFX8PLUS-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GFX8PLUS-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; GFX8PLUS-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX8PLUS-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; GFX8PLUS-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[BB_1:%.*]] +; GFX8PLUS: bb.1: +; GFX8PLUS-NEXT: [[ADD0:%.*]] = add i8 [[ELE0]], 1 +; GFX8PLUS-NEXT: [[ADD1:%.*]] = add i8 [[ELE1]], 1 +; GFX8PLUS-NEXT: [[ADD2:%.*]] = add i8 [[ELE2]], 1 +; GFX8PLUS-NEXT: [[ADD3:%.*]] = add i8 [[ELE3]], 1 +; GFX8PLUS-NEXT: br label [[EXIT]] +; GFX8PLUS: exit: +; GFX8PLUS-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[ADD0]], [[BB_1]] ] +; GFX8PLUS-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[ADD1]], [[BB_1]] ] +; GFX8PLUS-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[ADD2]], [[BB_1]] ] +; GFX8PLUS-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[ADD3]], [[BB_1]] ] +; GFX8PLUS-NEXT: [[OTHERELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX8PLUS-NEXT: [[OTHERELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX8PLUS-NEXT: [[OTHERELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GFX8PLUS-NEXT: 
[[OTHERELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX8PLUS-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 +; GFX8PLUS-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GFX8PLUS-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 +; GFX8PLUS-NEXT: [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GFX8PLUS-NEXT: store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 2 +; GFX8PLUS-NEXT: ret void +; +; VECI8-LABEL: @arith_phi( +; VECI8-NEXT: entry: +; VECI8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; VECI8-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 +; VECI8-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; VECI8-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; VECI8-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; VECI8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; VECI8-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[BB_1:%.*]] +; VECI8: bb.1: +; VECI8-NEXT: [[TMP1:%.*]] = add <4 x i8> [[TMP0]], splat (i8 1) +; VECI8-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> +; VECI8-NEXT: br label [[EXIT]] +; VECI8: exit: +; VECI8-NEXT: [[TMP3:%.*]] = phi <4 x i8> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB_1]] ] +; VECI8-NEXT: [[OTHERELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; VECI8-NEXT: [[OTHERELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; VECI8-NEXT: [[OTHERELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; VECI8-NEXT: [[OTHERELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; VECI8-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> poison, <16 x i32> +; VECI8-NEXT: store <16 x i8> [[TMP4]], ptr [[OUT:%.*]], align 2 +; 
VECI8-NEXT: ret void +; +entry: + %gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0 + %ele0 = load i8, ptr addrspace(3) %gep0, align 8 + %gep1 = getelementptr i8, ptr addrspace(3) %inptr0, i32 1 + %ele1 = load i8, ptr addrspace(3) %gep1, align 1 + %gep2 = getelementptr i8, ptr addrspace(3) %inptr0, i32 2 + %ele2 = load i8, ptr addrspace(3) %gep2, align 2 + %gep3 = getelementptr i8, ptr addrspace(3) %inptr0, i32 3 + %ele3 = load i8, ptr addrspace(3) %gep3, align 1 + %cmp = icmp eq i32 %flag, 0 + br i1 %cmp, label %exit, label %bb.1 + +bb.1: + %add0 = add i8 %ele0, 1 + %add1 = add i8 %ele1, 1 + %add2 = add i8 %ele2, 1 + %add3 = add i8 %ele3, 1 + br label %exit + +exit: + %phi0 = phi i8 [ %ele3, %entry ], [ %add0, %bb.1 ] + %phi1 = phi i8 [ %ele2, %entry ], [ %add1, %bb.1 ] + %phi2 = phi i8 [ %ele1, %entry ], [ %add2, %bb.1 ] + %phi3 = phi i8 [ %ele0, %entry ], [ %add3, %bb.1 ] + %otherele0 = load i8, ptr addrspace(3) %gep0, align 8 + %otherele1 = load i8, ptr addrspace(3) %gep1, align 1 + %otherele2 = load i8, ptr addrspace(3) %gep2, align 2 + %otherele3 = load i8, ptr addrspace(3) %gep3, align 1 + %vec10 = insertelement <16 x i8> poison, i8 %phi3, i64 8 + %vec11 = insertelement <16 x i8> %vec10, i8 %phi2, i64 9 + %vec12 = insertelement <16 x i8> %vec11, i8 %phi1, i64 10 + %vec13 = insertelement <16 x i8> %vec12, i8 %phi0, i64 11 + store <16 x i8> %vec13, ptr %out, align 2 + ret void +} + +define protected amdgpu_kernel void @arith(<16 x i8> %invec, ptr %out, i32 %flag) { +; GFX7-LABEL: @arith( +; GFX7-NEXT: entry: +; GFX7-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC:%.*]], i64 0 +; GFX7-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1 +; GFX7-NEXT: [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2 +; GFX7-NEXT: [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3 +; GFX7-NEXT: [[EL4:%.*]] = extractelement <16 x i8> [[INVEC]], i64 4 +; GFX7-NEXT: [[EL5:%.*]] = extractelement <16 x i8> [[INVEC]], i64 5 +; GFX7-NEXT: [[EL6:%.*]] = 
extractelement <16 x i8> [[INVEC]], i64 6 +; GFX7-NEXT: [[EL7:%.*]] = extractelement <16 x i8> [[INVEC]], i64 7 +; GFX7-NEXT: [[EL8:%.*]] = extractelement <16 x i8> [[INVEC]], i64 8 +; GFX7-NEXT: [[EL9:%.*]] = extractelement <16 x i8> [[INVEC]], i64 9 +; GFX7-NEXT: [[EL10:%.*]] = extractelement <16 x i8> [[INVEC]], i64 10 +; GFX7-NEXT: [[EL11:%.*]] = extractelement <16 x i8> [[INVEC]], i64 11 +; GFX7-NEXT: [[EL12:%.*]] = extractelement <16 x i8> [[INVEC]], i64 12 +; GFX7-NEXT: [[EL13:%.*]] = extractelement <16 x i8> [[INVEC]], i64 13 +; GFX7-NEXT: [[EL14:%.*]] = extractelement <16 x i8> [[INVEC]], i64 14 +; GFX7-NEXT: [[EL15:%.*]] = extractelement <16 x i8> [[INVEC]], i64 15 +; GFX7-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1 +; GFX7-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1 +; GFX7-NEXT: [[MUL2:%.*]] = mul i8 [[EL2]], 1 +; GFX7-NEXT: [[MUL3:%.*]] = mul i8 [[EL3]], 1 +; GFX7-NEXT: [[MUL4:%.*]] = mul i8 [[EL4]], 1 +; GFX7-NEXT: [[MUL5:%.*]] = mul i8 [[EL5]], 1 +; GFX7-NEXT: [[MUL6:%.*]] = mul i8 [[EL6]], 1 +; GFX7-NEXT: [[MUL7:%.*]] = mul i8 [[EL7]], 1 +; GFX7-NEXT: [[MUL8:%.*]] = mul i8 [[EL8]], 1 +; GFX7-NEXT: [[MUL9:%.*]] = mul i8 [[EL9]], 1 +; GFX7-NEXT: [[MUL10:%.*]] = mul i8 [[EL10]], 1 +; GFX7-NEXT: [[MUL11:%.*]] = mul i8 [[EL11]], 1 +; GFX7-NEXT: [[MUL12:%.*]] = mul i8 [[EL12]], 1 +; GFX7-NEXT: [[MUL13:%.*]] = mul i8 [[EL13]], 1 +; GFX7-NEXT: [[MUL14:%.*]] = mul i8 [[EL14]], 1 +; GFX7-NEXT: [[MUL15:%.*]] = mul i8 [[EL15]], 1 +; GFX7-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1 +; GFX7-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1 +; GFX7-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1 +; GFX7-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1 +; GFX7-NEXT: [[ADD4:%.*]] = add i8 [[MUL4]], 1 +; GFX7-NEXT: [[ADD5:%.*]] = add i8 [[MUL5]], 1 +; GFX7-NEXT: [[ADD6:%.*]] = add i8 [[MUL6]], 1 +; GFX7-NEXT: [[ADD7:%.*]] = add i8 [[MUL7]], 1 +; GFX7-NEXT: [[ADD8:%.*]] = add i8 [[MUL8]], 1 +; GFX7-NEXT: [[ADD9:%.*]] = add i8 [[MUL9]], 1 +; GFX7-NEXT: [[ADD10:%.*]] = add i8 [[MUL10]], 1 +; GFX7-NEXT: 
[[ADD11:%.*]] = add i8 [[MUL11]], 1 +; GFX7-NEXT: [[ADD12:%.*]] = add i8 [[MUL12]], 1 +; GFX7-NEXT: [[ADD13:%.*]] = add i8 [[MUL13]], 1 +; GFX7-NEXT: [[ADD14:%.*]] = add i8 [[MUL14]], 1 +; GFX7-NEXT: [[ADD15:%.*]] = add i8 [[MUL15]], 1 +; GFX7-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0 +; GFX7-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1 +; GFX7-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2 +; GFX7-NEXT: [[VECINS3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3 +; GFX7-NEXT: [[VECINS4:%.*]] = insertelement <16 x i8> [[VECINS3]], i8 [[ADD4]], i64 4 +; GFX7-NEXT: [[VECINS5:%.*]] = insertelement <16 x i8> [[VECINS4]], i8 [[ADD5]], i64 5 +; GFX7-NEXT: [[VECINS6:%.*]] = insertelement <16 x i8> [[VECINS5]], i8 [[ADD6]], i64 6 +; GFX7-NEXT: [[VECINS7:%.*]] = insertelement <16 x i8> [[VECINS6]], i8 [[ADD7]], i64 7 +; GFX7-NEXT: [[VECINS8:%.*]] = insertelement <16 x i8> [[VECINS7]], i8 [[ADD8]], i64 8 +; GFX7-NEXT: [[VECINS9:%.*]] = insertelement <16 x i8> [[VECINS8]], i8 [[ADD9]], i64 9 +; GFX7-NEXT: [[VECINS10:%.*]] = insertelement <16 x i8> [[VECINS9]], i8 [[ADD10]], i64 10 +; GFX7-NEXT: [[VECINS11:%.*]] = insertelement <16 x i8> [[VECINS10]], i8 [[ADD11]], i64 11 +; GFX7-NEXT: [[VECINS12:%.*]] = insertelement <16 x i8> [[VECINS11]], i8 [[ADD12]], i64 12 +; GFX7-NEXT: [[VECINS13:%.*]] = insertelement <16 x i8> [[VECINS12]], i8 [[ADD13]], i64 13 +; GFX7-NEXT: [[VECINS14:%.*]] = insertelement <16 x i8> [[VECINS13]], i8 [[ADD14]], i64 14 +; GFX7-NEXT: [[VECINS15:%.*]] = insertelement <16 x i8> [[VECINS14]], i8 [[ADD15]], i64 15 +; GFX7-NEXT: store <16 x i8> [[VECINS15]], ptr [[OUT:%.*]], align 16 +; GFX7-NEXT: ret void +; +; GFX8PLUS-LABEL: @arith( +; GFX8PLUS-NEXT: entry: +; GFX8PLUS-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC:%.*]], i64 0 +; GFX8PLUS-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1 +; GFX8PLUS-NEXT: [[EL2:%.*]] = 
extractelement <16 x i8> [[INVEC]], i64 2 +; GFX8PLUS-NEXT: [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3 +; GFX8PLUS-NEXT: [[EL4:%.*]] = extractelement <16 x i8> [[INVEC]], i64 4 +; GFX8PLUS-NEXT: [[EL5:%.*]] = extractelement <16 x i8> [[INVEC]], i64 5 +; GFX8PLUS-NEXT: [[EL6:%.*]] = extractelement <16 x i8> [[INVEC]], i64 6 +; GFX8PLUS-NEXT: [[EL7:%.*]] = extractelement <16 x i8> [[INVEC]], i64 7 +; GFX8PLUS-NEXT: [[EL8:%.*]] = extractelement <16 x i8> [[INVEC]], i64 8 +; GFX8PLUS-NEXT: [[EL9:%.*]] = extractelement <16 x i8> [[INVEC]], i64 9 +; GFX8PLUS-NEXT: [[EL10:%.*]] = extractelement <16 x i8> [[INVEC]], i64 10 +; GFX8PLUS-NEXT: [[EL11:%.*]] = extractelement <16 x i8> [[INVEC]], i64 11 +; GFX8PLUS-NEXT: [[EL12:%.*]] = extractelement <16 x i8> [[INVEC]], i64 12 +; GFX8PLUS-NEXT: [[EL13:%.*]] = extractelement <16 x i8> [[INVEC]], i64 13 +; GFX8PLUS-NEXT: [[EL14:%.*]] = extractelement <16 x i8> [[INVEC]], i64 14 +; GFX8PLUS-NEXT: [[EL15:%.*]] = extractelement <16 x i8> [[INVEC]], i64 15 +; GFX8PLUS-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1 +; GFX8PLUS-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1 +; GFX8PLUS-NEXT: [[MUL2:%.*]] = mul i8 [[EL2]], 1 +; GFX8PLUS-NEXT: [[MUL3:%.*]] = mul i8 [[EL3]], 1 +; GFX8PLUS-NEXT: [[MUL4:%.*]] = mul i8 [[EL4]], 1 +; GFX8PLUS-NEXT: [[MUL5:%.*]] = mul i8 [[EL5]], 1 +; GFX8PLUS-NEXT: [[MUL6:%.*]] = mul i8 [[EL6]], 1 +; GFX8PLUS-NEXT: [[MUL7:%.*]] = mul i8 [[EL7]], 1 +; GFX8PLUS-NEXT: [[MUL8:%.*]] = mul i8 [[EL8]], 1 +; GFX8PLUS-NEXT: [[MUL9:%.*]] = mul i8 [[EL9]], 1 +; GFX8PLUS-NEXT: [[MUL10:%.*]] = mul i8 [[EL10]], 1 +; GFX8PLUS-NEXT: [[MUL11:%.*]] = mul i8 [[EL11]], 1 +; GFX8PLUS-NEXT: [[MUL12:%.*]] = mul i8 [[EL12]], 1 +; GFX8PLUS-NEXT: [[MUL13:%.*]] = mul i8 [[EL13]], 1 +; GFX8PLUS-NEXT: [[MUL14:%.*]] = mul i8 [[EL14]], 1 +; GFX8PLUS-NEXT: [[MUL15:%.*]] = mul i8 [[EL15]], 1 +; GFX8PLUS-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1 +; GFX8PLUS-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1 +; GFX8PLUS-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1 
+; GFX8PLUS-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1 +; GFX8PLUS-NEXT: [[ADD4:%.*]] = add i8 [[MUL4]], 1 +; GFX8PLUS-NEXT: [[ADD5:%.*]] = add i8 [[MUL5]], 1 +; GFX8PLUS-NEXT: [[ADD6:%.*]] = add i8 [[MUL6]], 1 +; GFX8PLUS-NEXT: [[ADD7:%.*]] = add i8 [[MUL7]], 1 +; GFX8PLUS-NEXT: [[ADD8:%.*]] = add i8 [[MUL8]], 1 +; GFX8PLUS-NEXT: [[ADD9:%.*]] = add i8 [[MUL9]], 1 +; GFX8PLUS-NEXT: [[ADD10:%.*]] = add i8 [[MUL10]], 1 +; GFX8PLUS-NEXT: [[ADD11:%.*]] = add i8 [[MUL11]], 1 +; GFX8PLUS-NEXT: [[ADD12:%.*]] = add i8 [[MUL12]], 1 +; GFX8PLUS-NEXT: [[ADD13:%.*]] = add i8 [[MUL13]], 1 +; GFX8PLUS-NEXT: [[ADD14:%.*]] = add i8 [[MUL14]], 1 +; GFX8PLUS-NEXT: [[ADD15:%.*]] = add i8 [[MUL15]], 1 +; GFX8PLUS-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0 +; GFX8PLUS-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1 +; GFX8PLUS-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2 +; GFX8PLUS-NEXT: [[VECINS3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3 +; GFX8PLUS-NEXT: [[VECINS4:%.*]] = insertelement <16 x i8> [[VECINS3]], i8 [[ADD4]], i64 4 +; GFX8PLUS-NEXT: [[VECINS5:%.*]] = insertelement <16 x i8> [[VECINS4]], i8 [[ADD5]], i64 5 +; GFX8PLUS-NEXT: [[VECINS6:%.*]] = insertelement <16 x i8> [[VECINS5]], i8 [[ADD6]], i64 6 +; GFX8PLUS-NEXT: [[VECINS7:%.*]] = insertelement <16 x i8> [[VECINS6]], i8 [[ADD7]], i64 7 +; GFX8PLUS-NEXT: [[VECINS8:%.*]] = insertelement <16 x i8> [[VECINS7]], i8 [[ADD8]], i64 8 +; GFX8PLUS-NEXT: [[VECINS9:%.*]] = insertelement <16 x i8> [[VECINS8]], i8 [[ADD9]], i64 9 +; GFX8PLUS-NEXT: [[VECINS10:%.*]] = insertelement <16 x i8> [[VECINS9]], i8 [[ADD10]], i64 10 +; GFX8PLUS-NEXT: [[VECINS11:%.*]] = insertelement <16 x i8> [[VECINS10]], i8 [[ADD11]], i64 11 +; GFX8PLUS-NEXT: [[VECINS12:%.*]] = insertelement <16 x i8> [[VECINS11]], i8 [[ADD12]], i64 12 +; GFX8PLUS-NEXT: [[VECINS13:%.*]] = insertelement <16 x i8> [[VECINS12]], i8 [[ADD13]], i64 13 +; 
GFX8PLUS-NEXT: [[VECINS14:%.*]] = insertelement <16 x i8> [[VECINS13]], i8 [[ADD14]], i64 14 +; GFX8PLUS-NEXT: [[VECINS15:%.*]] = insertelement <16 x i8> [[VECINS14]], i8 [[ADD15]], i64 15 +; GFX8PLUS-NEXT: store <16 x i8> [[VECINS15]], ptr [[OUT:%.*]], align 16 +; GFX8PLUS-NEXT: ret void +; +; VECI8-LABEL: @arith( +; VECI8-NEXT: entry: +; VECI8-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> poison, <4 x i32> +; VECI8-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1) +; VECI8-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1) +; VECI8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; VECI8-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], splat (i8 1) +; VECI8-NEXT: [[TMP5:%.*]] = add <4 x i8> [[TMP4]], splat (i8 1) +; VECI8-NEXT: [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; VECI8-NEXT: [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], splat (i8 1) +; VECI8-NEXT: [[TMP8:%.*]] = add <4 x i8> [[TMP7]], splat (i8 1) +; VECI8-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; VECI8-NEXT: [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], splat (i8 1) +; VECI8-NEXT: [[TMP11:%.*]] = add <4 x i8> [[TMP10]], splat (i8 1) +; VECI8-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> +; VECI8-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> +; VECI8-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> +; VECI8-NEXT: store <16 x i8> [[VECINS153]], ptr [[OUT:%.*]], align 16 
+; VECI8-NEXT: ret void +; +entry: + %el0 = extractelement <16 x i8> %invec, i64 0 + %el1 = extractelement <16 x i8> %invec, i64 1 + %el2 = extractelement <16 x i8> %invec, i64 2 + %el3 = extractelement <16 x i8> %invec, i64 3 + %el4 = extractelement <16 x i8> %invec, i64 4 + %el5 = extractelement <16 x i8> %invec, i64 5 + %el6 = extractelement <16 x i8> %invec, i64 6 + %el7 = extractelement <16 x i8> %invec, i64 7 + %el8 = extractelement <16 x i8> %invec, i64 8 + %el9 = extractelement <16 x i8> %invec, i64 9 + %el10 = extractelement <16 x i8> %invec, i64 10 + %el11 = extractelement <16 x i8> %invec, i64 11 + %el12 = extractelement <16 x i8> %invec, i64 12 + %el13 = extractelement <16 x i8> %invec, i64 13 + %el14 = extractelement <16 x i8> %invec, i64 14 + %el15 = extractelement <16 x i8> %invec, i64 15 + %mul0 = mul i8 %el0, 1 + %mul1 = mul i8 %el1, 1 + %mul2 = mul i8 %el2, 1 + %mul3 = mul i8 %el3, 1 + %mul4 = mul i8 %el4, 1 + %mul5 = mul i8 %el5, 1 + %mul6 = mul i8 %el6, 1 + %mul7 = mul i8 %el7, 1 + %mul8 = mul i8 %el8, 1 + %mul9 = mul i8 %el9, 1 + %mul10 = mul i8 %el10, 1 + %mul11 = mul i8 %el11, 1 + %mul12 = mul i8 %el12, 1 + %mul13 = mul i8 %el13, 1 + %mul14 = mul i8 %el14, 1 + %mul15 = mul i8 %el15, 1 + %add0 = add i8 %mul0, 1 + %add1 = add i8 %mul1, 1 + %add2 = add i8 %mul2, 1 + %add3 = add i8 %mul3, 1 + %add4 = add i8 %mul4, 1 + %add5 = add i8 %mul5, 1 + %add6 = add i8 %mul6, 1 + %add7 = add i8 %mul7, 1 + %add8 = add i8 %mul8, 1 + %add9 = add i8 %mul9, 1 + %add10 = add i8 %mul10, 1 + %add11 = add i8 %mul11, 1 + %add12 = add i8 %mul12, 1 + %add13 = add i8 %mul13, 1 + %add14 = add i8 %mul14, 1 + %add15 = add i8 %mul15, 1 + %vecins0 = insertelement <16 x i8> poison, i8 %add0, i64 0 + %vecins1 = insertelement <16 x i8> %vecins0, i8 %add1, i64 1 + %vecins2 = insertelement <16 x i8> %vecins1, i8 %add2, i64 2 + %vecins3 = insertelement <16 x i8> %vecins2, i8 %add3, i64 3 + %vecins4 = insertelement <16 x i8> %vecins3, i8 %add4, i64 4 + %vecins5 = insertelement <16 
x i8> %vecins4, i8 %add5, i64 5 + %vecins6 = insertelement <16 x i8> %vecins5, i8 %add6, i64 6 + %vecins7 = insertelement <16 x i8> %vecins6, i8 %add7, i64 7 + %vecins8 = insertelement <16 x i8> %vecins7, i8 %add8, i64 8 + %vecins9 = insertelement <16 x i8> %vecins8, i8 %add9, i64 9 + %vecins10 = insertelement <16 x i8> %vecins9, i8 %add10, i64 10 + %vecins11 = insertelement <16 x i8> %vecins10, i8 %add11, i64 11 + %vecins12 = insertelement <16 x i8> %vecins11, i8 %add12, i64 12 + %vecins13 = insertelement <16 x i8> %vecins12, i8 %add13, i64 13 + %vecins14 = insertelement <16 x i8> %vecins13, i8 %add14, i64 14 + %vecins15 = insertelement <16 x i8> %vecins14, i8 %add15, i64 15 + store <16 x i8> %vecins15, ptr %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX8: {{.*}} +; GFX9: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll index 3b63c1e35610f..03dd882059ee3 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll @@ -49,19 +49,19 @@ bb1: define <4 x half> @phis_reverse(i1 %cmp1, <4 x half> %in1, <4 x half> %in2) { ; CHECK-LABEL: @phis_reverse( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> ; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]] ; CHECK: bb0: -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> 
poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> ; CHECK-NEXT: br label [[BB1]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3]], [[BB0]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x half> [ [[TMP2]], [[ENTRY]] ], [ [[TMP9]], [[BB0]] ] ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> [[TMP4]], <4 x i32> ; CHECK-NEXT: ret <4 x half> [[TMP8]] ; entry: @@ -90,3 +90,108 @@ bb1: %o3 = insertelement <4 x half> %o2, half %c3, i64 3 ret <4 x half> %o3 } + + +define <4 x i8> @phisi8(i1 %cmp1, <4 x i8> %in1, <4 x i8> %in2) { +; CHECK-LABEL: @phisi8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x i8> [[IN1:%.*]], i64 0 +; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x i8> [[IN1]], i64 1 +; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x i8> [[IN1]], i64 2 +; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x i8> [[IN1]], i64 3 +; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]] +; CHECK: bb0: +; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x i8> [[IN2:%.*]], i64 0 +; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x i8> [[IN2]], i64 1 +; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x i8> [[IN2]], i64 2 +; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x i8> [[IN2]], i64 3 +; CHECK-NEXT: br label [[BB1]] +; CHECK: bb1: +; CHECK-NEXT: [[C0:%.*]] = phi i8 [ [[A0]], 
[[ENTRY:%.*]] ], [ [[B0]], [[BB0]] ] +; CHECK-NEXT: [[C1:%.*]] = phi i8 [ [[A1]], [[ENTRY]] ], [ [[B1]], [[BB0]] ] +; CHECK-NEXT: [[C2:%.*]] = phi i8 [ [[A2]], [[ENTRY]] ], [ [[B2]], [[BB0]] ] +; CHECK-NEXT: [[C3:%.*]] = phi i8 [ [[A3]], [[ENTRY]] ], [ [[B3]], [[BB0]] ] +; CHECK-NEXT: [[O0:%.*]] = insertelement <4 x i8> undef, i8 [[C0]], i64 0 +; CHECK-NEXT: [[O1:%.*]] = insertelement <4 x i8> [[O0]], i8 [[C1]], i64 1 +; CHECK-NEXT: [[O2:%.*]] = insertelement <4 x i8> [[O1]], i8 [[C2]], i64 2 +; CHECK-NEXT: [[O3:%.*]] = insertelement <4 x i8> [[O2]], i8 [[C3]], i64 3 +; CHECK-NEXT: ret <4 x i8> [[O3]] +; +entry: + %a0 = extractelement <4 x i8> %in1, i64 0 + %a1 = extractelement <4 x i8> %in1, i64 1 + %a2 = extractelement <4 x i8> %in1, i64 2 + %a3 = extractelement <4 x i8> %in1, i64 3 + br i1 %cmp1, label %bb1, label %bb0 + +bb0: + %b0 = extractelement <4 x i8> %in2, i64 0 + %b1 = extractelement <4 x i8> %in2, i64 1 + %b2 = extractelement <4 x i8> %in2, i64 2 + %b3 = extractelement <4 x i8> %in2, i64 3 + br label %bb1 + +bb1: + %c0 = phi i8 [ %a0, %entry ], [ %b0, %bb0 ] + %c1 = phi i8 [ %a1, %entry ], [ %b1, %bb0 ] + %c2 = phi i8 [ %a2, %entry ], [ %b2, %bb0 ] + %c3 = phi i8 [ %a3, %entry ], [ %b3, %bb0 ] + + %o0 = insertelement <4 x i8> undef, i8 %c0, i64 0 + %o1 = insertelement <4 x i8> %o0, i8 %c1, i64 1 + %o2 = insertelement <4 x i8> %o1, i8 %c2, i64 2 + %o3 = insertelement <4 x i8> %o2, i8 %c3, i64 3 + ret <4 x i8> %o3 +} + +define <4 x i8> @phisi8_reverse(i1 %cmp1, <4 x i8> %in1, <4 x i8> %in2) { +; CHECK-LABEL: @phisi8_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x i8> [[IN1:%.*]], i64 0 +; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x i8> [[IN1]], i64 1 +; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x i8> [[IN1]], i64 2 +; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x i8> [[IN1]], i64 3 +; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]] +; CHECK: bb0: +; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x i8> 
[[IN2:%.*]], i64 0 +; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x i8> [[IN2]], i64 1 +; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x i8> [[IN2]], i64 2 +; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x i8> [[IN2]], i64 3 +; CHECK-NEXT: br label [[BB1]] +; CHECK: bb1: +; CHECK-NEXT: [[C3:%.*]] = phi i8 [ [[A3]], [[ENTRY:%.*]] ], [ [[B3]], [[BB0]] ] +; CHECK-NEXT: [[C2:%.*]] = phi i8 [ [[A2]], [[ENTRY]] ], [ [[B2]], [[BB0]] ] +; CHECK-NEXT: [[C1:%.*]] = phi i8 [ [[A1]], [[ENTRY]] ], [ [[B1]], [[BB0]] ] +; CHECK-NEXT: [[C0:%.*]] = phi i8 [ [[A0]], [[ENTRY]] ], [ [[B0]], [[BB0]] ] +; CHECK-NEXT: [[O0:%.*]] = insertelement <4 x i8> undef, i8 [[C0]], i64 0 +; CHECK-NEXT: [[O1:%.*]] = insertelement <4 x i8> [[O0]], i8 [[C1]], i64 1 +; CHECK-NEXT: [[O2:%.*]] = insertelement <4 x i8> [[O1]], i8 [[C2]], i64 2 +; CHECK-NEXT: [[O3:%.*]] = insertelement <4 x i8> [[O2]], i8 [[C3]], i64 3 +; CHECK-NEXT: ret <4 x i8> [[O3]] +; +entry: + %a0 = extractelement <4 x i8> %in1, i64 0 + %a1 = extractelement <4 x i8> %in1, i64 1 + %a2 = extractelement <4 x i8> %in1, i64 2 + %a3 = extractelement <4 x i8> %in1, i64 3 + br i1 %cmp1, label %bb1, label %bb0 + +bb0: + %b0 = extractelement <4 x i8> %in2, i64 0 + %b1 = extractelement <4 x i8> %in2, i64 1 + %b2 = extractelement <4 x i8> %in2, i64 2 + %b3 = extractelement <4 x i8> %in2, i64 3 + br label %bb1 + +bb1: + %c3 = phi i8 [ %a3, %entry ], [ %b3, %bb0 ] + %c2 = phi i8 [ %a2, %entry ], [ %b2, %bb0 ] + %c1 = phi i8 [ %a1, %entry ], [ %b1, %bb0 ] + %c0 = phi i8 [ %a0, %entry ], [ %b0, %bb0 ] + + %o0 = insertelement <4 x i8> undef, i8 %c0, i64 0 + %o1 = insertelement <4 x i8> %o0, i8 %c1, i64 1 + %o2 = insertelement <4 x i8> %o1, i8 %c2, i64 2 + %o3 = insertelement <4 x i8> %o2, i8 %c3, i64 3 + ret <4 x i8> %o3 +} diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll index b5bfdf284ca62..e7007c30e9f58 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll @@ -549,3 +549,330 @@ entry: ret float %add3 } + +define i8 @reduction_v4i8(<4 x i8> %a) { +; GCN-LABEL: @reduction_v4i8( +; GCN-NEXT: entry: +; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x i8> [[A:%.*]], i64 0 +; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x i8> [[A]], i64 1 +; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x i8> [[A]], i64 2 +; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x i8> [[A]], i64 3 +; GCN-NEXT: [[ADD1:%.*]] = add i8 [[ELT1]], [[ELT0]] +; GCN-NEXT: [[ADD2:%.*]] = add i8 [[ELT2]], [[ADD1]] +; GCN-NEXT: [[ADD3:%.*]] = add i8 [[ELT3]], [[ADD2]] +; GCN-NEXT: ret i8 [[ADD3]] +; +entry: + %elt0 = extractelement <4 x i8> %a, i64 0 + %elt1 = extractelement <4 x i8> %a, i64 1 + %elt2 = extractelement <4 x i8> %a, i64 2 + %elt3 = extractelement <4 x i8> %a, i64 3 + + %add1 = add i8 %elt1, %elt0 + %add2 = add i8 %elt2, %add1 + %add3 = add i8 %elt3, %add2 + + ret i8 %add3 +} + +define i8 @reduction_v8i8(<8 x i8> %vec8) { +; GCN-LABEL: @reduction_v8i8( +; GCN-NEXT: entry: +; GCN-NEXT: [[ELT0:%.*]] = extractelement <8 x i8> [[VEC8:%.*]], i64 0 +; GCN-NEXT: [[ELT1:%.*]] = extractelement <8 x i8> [[VEC8]], i64 1 +; GCN-NEXT: [[ELT2:%.*]] = extractelement <8 x i8> [[VEC8]], i64 2 +; GCN-NEXT: [[ELT3:%.*]] = extractelement <8 x i8> [[VEC8]], i64 3 +; GCN-NEXT: [[ELT4:%.*]] = extractelement <8 x i8> [[VEC8]], i64 4 +; GCN-NEXT: [[ELT5:%.*]] = extractelement <8 x i8> [[VEC8]], i64 5 +; GCN-NEXT: [[ELT6:%.*]] = extractelement <8 x i8> [[VEC8]], i64 6 +; GCN-NEXT: [[ELT7:%.*]] = extractelement <8 x i8> [[VEC8]], i64 7 +; GCN-NEXT: [[ADD1:%.*]] = add i8 [[ELT1]], [[ELT0]] +; GCN-NEXT: [[ADD2:%.*]] = add i8 [[ELT2]], [[ADD1]] +; GCN-NEXT: [[ADD3:%.*]] = add i8 [[ELT3]], [[ADD2]] +; GCN-NEXT: [[ADD4:%.*]] = add i8 [[ELT4]], [[ADD3]] +; GCN-NEXT: [[ADD5:%.*]] = add i8 [[ELT5]], [[ADD4]] +; GCN-NEXT: [[ADD6:%.*]] = add i8 [[ELT6]], [[ADD5]] +; GCN-NEXT: [[ADD7:%.*]] = add i8 [[ELT7]], [[ADD6]] +; GCN-NEXT: ret i8 
[[ADD7]] +; +entry: + %elt0 = extractelement <8 x i8> %vec8, i64 0 + %elt1 = extractelement <8 x i8> %vec8, i64 1 + %elt2 = extractelement <8 x i8> %vec8, i64 2 + %elt3 = extractelement <8 x i8> %vec8, i64 3 + %elt4 = extractelement <8 x i8> %vec8, i64 4 + %elt5 = extractelement <8 x i8> %vec8, i64 5 + %elt6 = extractelement <8 x i8> %vec8, i64 6 + %elt7 = extractelement <8 x i8> %vec8, i64 7 + + %add1 = add i8 %elt1, %elt0 + %add2 = add i8 %elt2, %add1 + %add3 = add i8 %elt3, %add2 + %add4 = add i8 %elt4, %add3 + %add5 = add i8 %elt5, %add4 + %add6 = add i8 %elt6, %add5 + %add7 = add i8 %elt7, %add6 + + ret i8 %add7 +} + +define i8 @reduction_umin_v4i8(<4 x i8> %vec4) { +; GCN-LABEL: @reduction_umin_v4i8( +; GCN-NEXT: entry: +; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x i8> [[VEC4:%.*]], i64 0 +; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x i8> [[VEC4]], i64 1 +; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x i8> [[VEC4]], i64 2 +; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x i8> [[VEC4]], i64 3 +; GCN-NEXT: [[CMP1:%.*]] = icmp ult i8 [[ELT1]], [[ELT0]] +; GCN-NEXT: [[MIN1:%.*]] = select i1 [[CMP1]], i8 [[ELT1]], i8 [[ELT0]] +; GCN-NEXT: [[CMP2:%.*]] = icmp ult i8 [[ELT2]], [[MIN1]] +; GCN-NEXT: [[MIN2:%.*]] = select i1 [[CMP2]], i8 [[ELT2]], i8 [[MIN1]] +; GCN-NEXT: [[CMP3:%.*]] = icmp ult i8 [[ELT3]], [[MIN2]] +; GCN-NEXT: [[MIN3:%.*]] = select i1 [[CMP3]], i8 [[ELT3]], i8 [[MIN2]] +; GCN-NEXT: ret i8 [[MIN3]] +; +entry: + %elt0 = extractelement <4 x i8> %vec4, i64 0 + %elt1 = extractelement <4 x i8> %vec4, i64 1 + %elt2 = extractelement <4 x i8> %vec4, i64 2 + %elt3 = extractelement <4 x i8> %vec4, i64 3 + + %cmp1 = icmp ult i8 %elt1, %elt0 + %min1 = select i1 %cmp1, i8 %elt1, i8 %elt0 + %cmp2 = icmp ult i8 %elt2, %min1 + %min2 = select i1 %cmp2, i8 %elt2, i8 %min1 + %cmp3 = icmp ult i8 %elt3, %min2 + %min3 = select i1 %cmp3, i8 %elt3, i8 %min2 + + ret i8 %min3 +} + +define i8 @reduction_icmp_v8i8(<8 x i8> %vec8) { +; GCN-LABEL: @reduction_icmp_v8i8( +; 
GCN-NEXT: entry: +; GCN-NEXT: [[ELT0:%.*]] = extractelement <8 x i8> [[VEC8:%.*]], i64 0 +; GCN-NEXT: [[ELT1:%.*]] = extractelement <8 x i8> [[VEC8]], i64 1 +; GCN-NEXT: [[ELT2:%.*]] = extractelement <8 x i8> [[VEC8]], i64 2 +; GCN-NEXT: [[ELT3:%.*]] = extractelement <8 x i8> [[VEC8]], i64 3 +; GCN-NEXT: [[ELT4:%.*]] = extractelement <8 x i8> [[VEC8]], i64 4 +; GCN-NEXT: [[ELT5:%.*]] = extractelement <8 x i8> [[VEC8]], i64 5 +; GCN-NEXT: [[ELT6:%.*]] = extractelement <8 x i8> [[VEC8]], i64 6 +; GCN-NEXT: [[ELT7:%.*]] = extractelement <8 x i8> [[VEC8]], i64 7 +; GCN-NEXT: [[CMP0:%.*]] = icmp ult i8 [[ELT1]], [[ELT0]] +; GCN-NEXT: [[MIN1:%.*]] = select i1 [[CMP0]], i8 [[ELT1]], i8 [[ELT0]] +; GCN-NEXT: [[CMP1:%.*]] = icmp ult i8 [[ELT2]], [[MIN1]] +; GCN-NEXT: [[MIN2:%.*]] = select i1 [[CMP1]], i8 [[ELT2]], i8 [[MIN1]] +; GCN-NEXT: [[CMP2:%.*]] = icmp ult i8 [[ELT3]], [[MIN2]] +; GCN-NEXT: [[MIN3:%.*]] = select i1 [[CMP2]], i8 [[ELT3]], i8 [[MIN2]] +; GCN-NEXT: [[CMP3:%.*]] = icmp ult i8 [[ELT4]], [[MIN3]] +; GCN-NEXT: [[MIN4:%.*]] = select i1 [[CMP3]], i8 [[ELT4]], i8 [[MIN3]] +; GCN-NEXT: [[CMP4:%.*]] = icmp ult i8 [[ELT5]], [[MIN4]] +; GCN-NEXT: [[MIN5:%.*]] = select i1 [[CMP4]], i8 [[ELT5]], i8 [[MIN4]] +; GCN-NEXT: [[CMP5:%.*]] = icmp ult i8 [[ELT6]], [[MIN5]] +; GCN-NEXT: [[MIN6:%.*]] = select i1 [[CMP5]], i8 [[ELT6]], i8 [[MIN5]] +; GCN-NEXT: [[CMP6:%.*]] = icmp ult i8 [[ELT7]], [[MIN6]] +; GCN-NEXT: [[MIN7:%.*]] = select i1 [[CMP6]], i8 [[ELT7]], i8 [[MIN6]] +; GCN-NEXT: ret i8 [[MIN7]] +; +entry: + %elt0 = extractelement <8 x i8> %vec8, i64 0 + %elt1 = extractelement <8 x i8> %vec8, i64 1 + %elt2 = extractelement <8 x i8> %vec8, i64 2 + %elt3 = extractelement <8 x i8> %vec8, i64 3 + %elt4 = extractelement <8 x i8> %vec8, i64 4 + %elt5 = extractelement <8 x i8> %vec8, i64 5 + %elt6 = extractelement <8 x i8> %vec8, i64 6 + %elt7 = extractelement <8 x i8> %vec8, i64 7 + + %cmp0 = icmp ult i8 %elt1, %elt0 + %min1 = select i1 %cmp0, i8 %elt1, i8 %elt0 + %cmp1 = 
icmp ult i8 %elt2, %min1 + %min2 = select i1 %cmp1, i8 %elt2, i8 %min1 + %cmp2 = icmp ult i8 %elt3, %min2 + %min3 = select i1 %cmp2, i8 %elt3, i8 %min2 + + %cmp3 = icmp ult i8 %elt4, %min3 + %min4 = select i1 %cmp3, i8 %elt4, i8 %min3 + %cmp4 = icmp ult i8 %elt5, %min4 + %min5 = select i1 %cmp4, i8 %elt5, i8 %min4 + + %cmp5 = icmp ult i8 %elt6, %min5 + %min6 = select i1 %cmp5, i8 %elt6, i8 %min5 + %cmp6 = icmp ult i8 %elt7, %min6 + %min7 = select i1 %cmp6, i8 %elt7, i8 %min6 + + ret i8 %min7 +} + +define i8 @reduction_smin_v16i8(<16 x i8> %vec16) { +; GCN-LABEL: @reduction_smin_v16i8( +; GCN-NEXT: entry: +; GCN-NEXT: [[ELT0:%.*]] = extractelement <16 x i8> [[VEC16:%.*]], i64 0 +; GCN-NEXT: [[ELT1:%.*]] = extractelement <16 x i8> [[VEC16]], i64 1 +; GCN-NEXT: [[ELT2:%.*]] = extractelement <16 x i8> [[VEC16]], i64 2 +; GCN-NEXT: [[ELT3:%.*]] = extractelement <16 x i8> [[VEC16]], i64 3 +; GCN-NEXT: [[ELT4:%.*]] = extractelement <16 x i8> [[VEC16]], i64 4 +; GCN-NEXT: [[ELT5:%.*]] = extractelement <16 x i8> [[VEC16]], i64 5 +; GCN-NEXT: [[ELT6:%.*]] = extractelement <16 x i8> [[VEC16]], i64 6 +; GCN-NEXT: [[ELT7:%.*]] = extractelement <16 x i8> [[VEC16]], i64 7 +; GCN-NEXT: [[ELT8:%.*]] = extractelement <16 x i8> [[VEC16]], i64 8 +; GCN-NEXT: [[ELT9:%.*]] = extractelement <16 x i8> [[VEC16]], i64 9 +; GCN-NEXT: [[ELT10:%.*]] = extractelement <16 x i8> [[VEC16]], i64 10 +; GCN-NEXT: [[ELT11:%.*]] = extractelement <16 x i8> [[VEC16]], i64 11 +; GCN-NEXT: [[ELT12:%.*]] = extractelement <16 x i8> [[VEC16]], i64 12 +; GCN-NEXT: [[ELT13:%.*]] = extractelement <16 x i8> [[VEC16]], i64 13 +; GCN-NEXT: [[ELT14:%.*]] = extractelement <16 x i8> [[VEC16]], i64 14 +; GCN-NEXT: [[ELT15:%.*]] = extractelement <16 x i8> [[VEC16]], i64 15 +; GCN-NEXT: [[CMP0:%.*]] = icmp slt i8 [[ELT1]], [[ELT0]] +; GCN-NEXT: [[MIN1:%.*]] = select i1 [[CMP0]], i8 [[ELT1]], i8 [[ELT0]] +; GCN-NEXT: [[CMP1:%.*]] = icmp slt i8 [[ELT2]], [[MIN1]] +; GCN-NEXT: [[MIN2:%.*]] = select i1 [[CMP1]], i8 [[ELT2]], 
i8 [[MIN1]] +; GCN-NEXT: [[CMP2:%.*]] = icmp slt i8 [[ELT3]], [[MIN2]] +; GCN-NEXT: [[MIN3:%.*]] = select i1 [[CMP2]], i8 [[ELT3]], i8 [[MIN2]] +; GCN-NEXT: [[CMP3:%.*]] = icmp slt i8 [[ELT4]], [[MIN3]] +; GCN-NEXT: [[MIN4:%.*]] = select i1 [[CMP3]], i8 [[ELT4]], i8 [[MIN3]] +; GCN-NEXT: [[CMP4:%.*]] = icmp slt i8 [[ELT5]], [[MIN4]] +; GCN-NEXT: [[MIN5:%.*]] = select i1 [[CMP4]], i8 [[ELT5]], i8 [[MIN4]] +; GCN-NEXT: [[CMP5:%.*]] = icmp slt i8 [[ELT6]], [[MIN5]] +; GCN-NEXT: [[MIN6:%.*]] = select i1 [[CMP5]], i8 [[ELT6]], i8 [[MIN5]] +; GCN-NEXT: [[CMP6:%.*]] = icmp slt i8 [[ELT7]], [[MIN6]] +; GCN-NEXT: [[MIN7:%.*]] = select i1 [[CMP6]], i8 [[ELT7]], i8 [[MIN6]] +; GCN-NEXT: [[CMP7:%.*]] = icmp slt i8 [[ELT8]], [[MIN7]] +; GCN-NEXT: [[MIN8:%.*]] = select i1 [[CMP7]], i8 [[ELT8]], i8 [[MIN7]] +; GCN-NEXT: [[CMP8:%.*]] = icmp slt i8 [[ELT9]], [[MIN8]] +; GCN-NEXT: [[MIN9:%.*]] = select i1 [[CMP8]], i8 [[ELT9]], i8 [[MIN8]] +; GCN-NEXT: [[CMP9:%.*]] = icmp slt i8 [[ELT10]], [[MIN9]] +; GCN-NEXT: [[MIN10:%.*]] = select i1 [[CMP9]], i8 [[ELT10]], i8 [[MIN9]] +; GCN-NEXT: [[CMP10:%.*]] = icmp slt i8 [[ELT11]], [[MIN10]] +; GCN-NEXT: [[MIN11:%.*]] = select i1 [[CMP10]], i8 [[ELT11]], i8 [[MIN10]] +; GCN-NEXT: [[CMP11:%.*]] = icmp slt i8 [[ELT12]], [[MIN11]] +; GCN-NEXT: [[MIN12:%.*]] = select i1 [[CMP11]], i8 [[ELT12]], i8 [[MIN11]] +; GCN-NEXT: [[CMP12:%.*]] = icmp slt i8 [[ELT13]], [[MIN12]] +; GCN-NEXT: [[MIN13:%.*]] = select i1 [[CMP12]], i8 [[ELT13]], i8 [[MIN12]] +; GCN-NEXT: [[CMP13:%.*]] = icmp slt i8 [[ELT14]], [[MIN13]] +; GCN-NEXT: [[MIN14:%.*]] = select i1 [[CMP13]], i8 [[ELT14]], i8 [[MIN13]] +; GCN-NEXT: [[CMP14:%.*]] = icmp slt i8 [[ELT15]], [[MIN14]] +; GCN-NEXT: [[MIN15:%.*]] = select i1 [[CMP14]], i8 [[ELT15]], i8 [[MIN14]] +; GCN-NEXT: ret i8 [[MIN15]] +; +entry: + %elt0 = extractelement <16 x i8> %vec16, i64 0 + %elt1 = extractelement <16 x i8> %vec16, i64 1 + %elt2 = extractelement <16 x i8> %vec16, i64 2 + %elt3 = extractelement <16 x i8> %vec16, 
i64 3 + %elt4 = extractelement <16 x i8> %vec16, i64 4 + %elt5 = extractelement <16 x i8> %vec16, i64 5 + %elt6 = extractelement <16 x i8> %vec16, i64 6 + %elt7 = extractelement <16 x i8> %vec16, i64 7 + + %elt8 = extractelement <16 x i8> %vec16, i64 8 + %elt9 = extractelement <16 x i8> %vec16, i64 9 + %elt10 = extractelement <16 x i8> %vec16, i64 10 + %elt11 = extractelement <16 x i8> %vec16, i64 11 + %elt12 = extractelement <16 x i8> %vec16, i64 12 + %elt13 = extractelement <16 x i8> %vec16, i64 13 + %elt14 = extractelement <16 x i8> %vec16, i64 14 + %elt15 = extractelement <16 x i8> %vec16, i64 15 + + %cmp0 = icmp slt i8 %elt1, %elt0 + %min1 = select i1 %cmp0, i8 %elt1, i8 %elt0 + %cmp1 = icmp slt i8 %elt2, %min1 + %min2 = select i1 %cmp1, i8 %elt2, i8 %min1 + %cmp2 = icmp slt i8 %elt3, %min2 + %min3 = select i1 %cmp2, i8 %elt3, i8 %min2 + + %cmp3 = icmp slt i8 %elt4, %min3 + %min4 = select i1 %cmp3, i8 %elt4, i8 %min3 + %cmp4 = icmp slt i8 %elt5, %min4 + %min5 = select i1 %cmp4, i8 %elt5, i8 %min4 + + %cmp5 = icmp slt i8 %elt6, %min5 + %min6 = select i1 %cmp5, i8 %elt6, i8 %min5 + %cmp6 = icmp slt i8 %elt7, %min6 + %min7 = select i1 %cmp6, i8 %elt7, i8 %min6 + + %cmp7 = icmp slt i8 %elt8, %min7 + %min8 = select i1 %cmp7, i8 %elt8, i8 %min7 + %cmp8 = icmp slt i8 %elt9, %min8 + %min9 = select i1 %cmp8, i8 %elt9, i8 %min8 + + %cmp9 = icmp slt i8 %elt10, %min9 + %min10 = select i1 %cmp9, i8 %elt10, i8 %min9 + %cmp10 = icmp slt i8 %elt11, %min10 + %min11 = select i1 %cmp10, i8 %elt11, i8 %min10 + + %cmp11 = icmp slt i8 %elt12, %min11 + %min12 = select i1 %cmp11, i8 %elt12, i8 %min11 + %cmp12 = icmp slt i8 %elt13, %min12 + %min13 = select i1 %cmp12, i8 %elt13, i8 %min12 + + %cmp13 = icmp slt i8 %elt14, %min13 + %min14 = select i1 %cmp13, i8 %elt14, i8 %min13 + %cmp14 = icmp slt i8 %elt15, %min14 + %min15 = select i1 %cmp14, i8 %elt15, i8 %min14 + + + ret i8 %min15 +} + +define i8 @reduction_umax_v4i8(<4 x i8> %vec4) { +; GCN-LABEL: @reduction_umax_v4i8( +; GCN-NEXT: 
entry: +; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x i8> [[VEC4:%.*]], i64 0 +; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x i8> [[VEC4]], i64 1 +; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x i8> [[VEC4]], i64 2 +; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x i8> [[VEC4]], i64 3 +; GCN-NEXT: [[CMP1:%.*]] = icmp ugt i8 [[ELT1]], [[ELT0]] +; GCN-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], i8 [[ELT1]], i8 [[ELT0]] +; GCN-NEXT: [[CMP2:%.*]] = icmp ugt i8 [[ELT2]], [[MAX1]] +; GCN-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], i8 [[ELT2]], i8 [[MAX1]] +; GCN-NEXT: [[CMP3:%.*]] = icmp ugt i8 [[ELT3]], [[MAX2]] +; GCN-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], i8 [[ELT3]], i8 [[MAX2]] +; GCN-NEXT: ret i8 [[MAX3]] +; +entry: + %elt0 = extractelement <4 x i8> %vec4, i64 0 + %elt1 = extractelement <4 x i8> %vec4, i64 1 + %elt2 = extractelement <4 x i8> %vec4, i64 2 + %elt3 = extractelement <4 x i8> %vec4, i64 3 + + %cmp1 = icmp ugt i8 %elt1, %elt0 + %max1 = select i1 %cmp1, i8 %elt1, i8 %elt0 + %cmp2 = icmp ugt i8 %elt2, %max1 + %max2 = select i1 %cmp2, i8 %elt2, i8 %max1 + %cmp3 = icmp ugt i8 %elt3, %max2 + %max3 = select i1 %cmp3, i8 %elt3, i8 %max2 + + ret i8 %max3 +} + +define i8 @reduction_smax_v4i8(<4 x i8> %vec4) { +; GCN-LABEL: @reduction_smax_v4i8( +; GCN-NEXT: entry: +; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x i8> [[VEC4:%.*]], i64 0 +; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x i8> [[VEC4]], i64 1 +; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x i8> [[VEC4]], i64 2 +; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x i8> [[VEC4]], i64 3 +; GCN-NEXT: [[CMP1:%.*]] = icmp sgt i8 [[ELT1]], [[ELT0]] +; GCN-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], i8 [[ELT1]], i8 [[ELT0]] +; GCN-NEXT: [[CMP2:%.*]] = icmp sgt i8 [[ELT2]], [[MAX1]] +; GCN-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], i8 [[ELT2]], i8 [[MAX1]] +; GCN-NEXT: [[CMP3:%.*]] = icmp sgt i8 [[ELT3]], [[MAX2]] +; GCN-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], i8 [[ELT3]], i8 [[MAX2]] +; GCN-NEXT: ret i8 [[MAX3]] +; +entry: + %elt0 
= extractelement <4 x i8> %vec4, i64 0 + %elt1 = extractelement <4 x i8> %vec4, i64 1 + %elt2 = extractelement <4 x i8> %vec4, i64 2 + %elt3 = extractelement <4 x i8> %vec4, i64 3 + + %cmp1 = icmp sgt i8 %elt1, %elt0 + %max1 = select i1 %cmp1, i8 %elt1, i8 %elt0 + %cmp2 = icmp sgt i8 %elt2, %max1 + %max2 = select i1 %cmp2, i8 %elt2, i8 %max1 + %cmp3 = icmp sgt i8 %elt3, %max2 + %max3 = select i1 %cmp3, i8 %elt3, i8 %max2 + + ret i8 %max3 +} diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll index dd7a21198ac1f..651f565412830 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll @@ -141,7 +141,7 @@ define ptr @test4() { ; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer ; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> ; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> -; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> +; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> ; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0) ; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2) ; POWEROF2-NEXT: br label [[TMP8:%.*]] @@ -156,10 +156,10 @@ define ptr @test4() { ; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer ; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2) ; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]] -; POWEROF2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 -; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP16]] ; 
POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 -; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00 +; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]] +; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00 ; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 ; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]] ; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll index 28bab3276c47d..6942df532ae29 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll @@ -7,9 +7,8 @@ define void @foo(double %i) { ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> , double [[I]], i32 2 ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]] ; CHECK-NEXT: [[I82:%.*]] = fsub double 0.000000e+00, poison +; CHECK-NEXT: [[I103:%.*]] = fsub double 0.000000e+00, [[I]] ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> , <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[I82]], i32 2 @@ -22,13 +21,11 @@ define void @foo(double %i) { ; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP16]]) ; CHECK-NEXT: br i1 [[TMP17]], label [[BB58:%.*]], label [[BB115:%.*]] ; CHECK: bb115: -; CHECK-NEXT: [[TMP18:%.*]] 
= fmul <2 x double> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP18]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = fmul double 0.000000e+00, [[I103]] +; CHECK-NEXT: [[TMP20:%.*]] = fmul double 0.000000e+00, [[I82]] ; CHECK-NEXT: [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]] ; CHECK-NEXT: [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP22]], <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x double> , double [[I82]], i32 3 ; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll index 0ed12760b563f..2c583d744f3d7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -25,7 +25,6 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -34,7 +33,6 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul 
nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] ; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 ; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 @@ -42,17 +40,20 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 ; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2 -; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 -; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T27]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T47]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: 
[[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[T701:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 -; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], splat (i32 3) +; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], splat (i32 3) ; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll index 385e37e2750d1..7c2d8b69bfbb3 100644 --- a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll +++ b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll @@ -38,7 +38,7 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0 ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i32 [[TMP16]], 1 ; CHECK-NEXT: br i1 [[TMP17]], label [[BB62:%.*]], label [[FLOW:%.*]] ; CHECK: Flow1: -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[INC_I:%.*]], [[INCREMENT_I:%.*]] ], [ undef, [[BB62]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[INC_I:%.*]], [[INCREMENT_I:%.*]] ], [ poison, [[BB62]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, [[INCREMENT_I]] ], [ true, [[BB62]] ] ; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ true, [[INCREMENT_I]] ], [ false, [[BB62]] ] ; CHECK-NEXT: br label [[FLOW]] @@ -74,7 +74,7 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0 ; CHECK-NEXT: call void asm sideeffect "s_nop 42", "~{memory}"() #[[ATTR0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: Flow: -; CHECK-NEXT: [[TMP6]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP6]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ poison, [[LOOP_HEADER]] ] ; CHECK-NEXT: [[TMP7]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ 
false, [[LOOP_HEADER]] ] ; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] ; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ] diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll b/llvm/test/Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll index 34c73ab8fd74f..7a2ba286eac1b 100644 --- a/llvm/test/Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll +++ b/llvm/test/Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll @@ -7,9 +7,9 @@ define amdgpu_cs void @uniform(i32 inreg %v) { ; CHECK-LABEL: @uniform( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CC:%.*]] = icmp eq i32 [[V:%.*]], 0 -; CHECK-NEXT: br i1 [[CC]], label [[IF:%.*]], label [[END:%.*]], !structurizecfg.uniform !0 +; CHECK-NEXT: br i1 [[CC]], label [[IF:%.*]], label [[END:%.*]], !structurizecfg.uniform [[META0:![0-9]+]] ; CHECK: if: -; CHECK-NEXT: br label [[END]], !structurizecfg.uniform !0 +; CHECK-NEXT: br label [[END]], !structurizecfg.uniform [[META0]] ; CHECK: end: ; CHECK-NEXT: ret void ; @@ -37,14 +37,14 @@ define amdgpu_cs void @nonuniform(ptr addrspace(4) %ptr) { ; CHECK-NEXT: [[CC2:%.*]] = icmp eq i32 [[V]], 0 ; CHECK-NEXT: br i1 [[CC2]], label [[END_LOOP:%.*]], label [[FLOW1:%.*]] ; CHECK: Flow: -; CHECK-NEXT: [[TMP0]] = phi i32 [ [[TMP2:%.*]], [[FLOW1]] ], [ undef, [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0]] = phi i32 [ [[TMP2:%.*]], [[FLOW1]] ], [ poison, [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP3:%.*]], [[FLOW1]] ], [ true, [[FOR_BODY]] ] ; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: end.loop: ; CHECK-NEXT: [[I_INC:%.*]] = add i32 [[I]], 1 ; CHECK-NEXT: br label [[FLOW1]] ; CHECK: Flow1: -; CHECK-NEXT: [[TMP2]] = phi i32 [ [[I_INC]], [[END_LOOP]] ], [ undef, [[MID_LOOP]] ] +; CHECK-NEXT: [[TMP2]] = phi i32 [ [[I_INC]], [[END_LOOP]] ], [ poison, [[MID_LOOP]] ] ; CHECK-NEXT: [[TMP3]] = phi i1 [ false, [[END_LOOP]] ], [ true, [[MID_LOOP]] ] ; CHECK-NEXT: br label 
[[FLOW]] ; CHECK: for.end: @@ -85,7 +85,7 @@ define amdgpu_cs void @uniform_branch_to_nonuniform_subregions(ptr addrspace(4) ; CHECK-LABEL: @uniform_branch_to_nonuniform_subregions( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[DATA:%.*]], 42 -; CHECK-NEXT: br i1 [[C]], label [[UNIFORM_FOR_BODY:%.*]], label [[FOR_BODY:%.*]], !structurizecfg.uniform !0 +; CHECK-NEXT: br i1 [[C]], label [[UNIFORM_FOR_BODY:%.*]], label [[FOR_BODY:%.*]], !structurizecfg.uniform [[META0]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[FLOW1:%.*]] ] ; CHECK-NEXT: [[CC:%.*]] = icmp ult i32 [[I]], 4 @@ -95,14 +95,14 @@ define amdgpu_cs void @uniform_branch_to_nonuniform_subregions(ptr addrspace(4) ; CHECK-NEXT: [[CC2:%.*]] = icmp eq i32 [[V]], 0 ; CHECK-NEXT: br i1 [[CC2]], label [[END_LOOP:%.*]], label [[FLOW2:%.*]] ; CHECK: Flow1: -; CHECK-NEXT: [[TMP0]] = phi i32 [ [[TMP2:%.*]], [[FLOW2]] ], [ undef, [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0]] = phi i32 [ [[TMP2:%.*]], [[FLOW2]] ], [ poison, [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP3:%.*]], [[FLOW2]] ], [ true, [[FOR_BODY]] ] ; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: end.loop: ; CHECK-NEXT: [[I_INC:%.*]] = add i32 [[I]], 1 ; CHECK-NEXT: br label [[FLOW2]] ; CHECK: Flow2: -; CHECK-NEXT: [[TMP2]] = phi i32 [ [[I_INC]], [[END_LOOP]] ], [ undef, [[MID_LOOP]] ] +; CHECK-NEXT: [[TMP2]] = phi i32 [ [[I_INC]], [[END_LOOP]] ], [ poison, [[MID_LOOP]] ] ; CHECK-NEXT: [[TMP3]] = phi i1 [ false, [[END_LOOP]] ], [ true, [[MID_LOOP]] ] ; CHECK-NEXT: br label [[FLOW1]] ; CHECK: for.end: @@ -118,14 +118,14 @@ define amdgpu_cs void @uniform_branch_to_nonuniform_subregions(ptr addrspace(4) ; CHECK-NEXT: [[UNIFORM_CC2:%.*]] = icmp eq i32 [[UNIFORM_V]], 0 ; CHECK-NEXT: br i1 [[UNIFORM_CC2]], label [[UNIFORM_END_LOOP:%.*]], label [[FLOW5:%.*]] ; CHECK: Flow4: -; CHECK-NEXT: [[TMP4]] = phi i32 [ [[TMP6:%.*]], [[FLOW5]] ], [ undef, 
[[UNIFORM_FOR_BODY]] ] +; CHECK-NEXT: [[TMP4]] = phi i32 [ [[TMP6:%.*]], [[FLOW5]] ], [ poison, [[UNIFORM_FOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP7:%.*]], [[FLOW5]] ], [ true, [[UNIFORM_FOR_BODY]] ] ; CHECK-NEXT: br i1 [[TMP5]], label [[UNIFORM_FOR_END:%.*]], label [[UNIFORM_FOR_BODY]] ; CHECK: uniform.end.loop: ; CHECK-NEXT: [[UNIFORM_I_INC:%.*]] = add i32 [[UNIFORM_I]], 1 ; CHECK-NEXT: br label [[FLOW5]] ; CHECK: Flow5: -; CHECK-NEXT: [[TMP6]] = phi i32 [ [[UNIFORM_I_INC]], [[UNIFORM_END_LOOP]] ], [ undef, [[UNIFORM_MID_LOOP]] ] +; CHECK-NEXT: [[TMP6]] = phi i32 [ [[UNIFORM_I_INC]], [[UNIFORM_END_LOOP]] ], [ poison, [[UNIFORM_MID_LOOP]] ] ; CHECK-NEXT: [[TMP7]] = phi i1 [ false, [[UNIFORM_END_LOOP]] ], [ true, [[UNIFORM_MID_LOOP]] ] ; CHECK-NEXT: br label [[FLOW4]] ; CHECK: uniform.for.end: diff --git a/llvm/test/Transforms/StructurizeCFG/interleaved-loop-order.ll b/llvm/test/Transforms/StructurizeCFG/interleaved-loop-order.ll index 3ca70dab27193..91e88b9212e7a 100644 --- a/llvm/test/Transforms/StructurizeCFG/interleaved-loop-order.ll +++ b/llvm/test/Transforms/StructurizeCFG/interleaved-loop-order.ll @@ -18,7 +18,6 @@ define i1 @test_nested(i32 %x, i1 %b1, i1 %b2, i1 %b3) { ; CHECK: exit.true: ; CHECK-NEXT: br label [[FLOW13]] ; CHECK: Flow13: -; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ true, [[EXIT_TRUE]] ], [ undef, [[FLOW12:%.*]] ] ; CHECK-NEXT: br i1 [[TMP2:%.*]], label [[EXIT_FALSE:%.*]], label [[EXIT:%.*]] ; CHECK: exit.false: ; CHECK-NEXT: br label [[EXIT]] @@ -30,7 +29,7 @@ define i1 @test_nested(i32 %x, i1 %b1, i1 %b2, i1 %b3) { ; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP16:%.*]], [[FLOW11:%.*]] ], [ true, [[OUTER_LOOP_HEADER]] ] ; CHECK-NEXT: [[TMP2]] = phi i1 [ [[TMP12:%.*]], [[FLOW11]] ], [ false, [[OUTER_LOOP_HEADER]] ] ; CHECK-NEXT: [[TMP3]] = phi i1 [ false, [[FLOW11]] ], [ true, [[OUTER_LOOP_HEADER]] ] -; CHECK-NEXT: br i1 [[TMP1]], label [[FLOW12]], label [[OUTER_LOOP_HEADER]] +; CHECK-NEXT: br i1 [[TMP1]], label [[FLOW12:%.*]], label 
[[OUTER_LOOP_HEADER]] ; CHECK: inner.loop.header: ; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP8:%.*]], [[FLOW4:%.*]] ], [ false, [[OUTER_LOOP_BODY]] ] ; CHECK-NEXT: br i1 [[B2:%.*]], label [[INNER_LOOP_BODY:%.*]], label [[FLOW4]] @@ -95,7 +94,7 @@ define i1 @test_nested(i32 %x, i1 %b1, i1 %b2, i1 %b3) { ; CHECK: inner.loop.latch: ; CHECK-NEXT: br label [[FLOW6]] ; CHECK: exit: -; CHECK-NEXT: [[R:%.*]] = phi i1 [ [[TMP0]], [[FLOW13]] ], [ false, [[EXIT_FALSE]] ] +; CHECK-NEXT: [[R:%.*]] = phi i1 [ true, [[FLOW13]] ], [ false, [[EXIT_FALSE]] ] ; CHECK-NEXT: ret i1 [[R]] ; entry: diff --git a/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll b/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll index 46881ec827286..799d6cc8655af 100644 --- a/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll +++ b/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll @@ -8,14 +8,13 @@ define float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: br label %[[HEADER:.*]] ; CHECK: [[HEADER]]: ; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FLOW2:.*]] ] -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FLOW2]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP4:%.*]], %[[FLOW2]] ] ; CHECK-NEXT: [[CC:%.*]] = icmp sge i32 [[IND]], [[X]] ; CHECK-NEXT: br i1 [[CC]], label %[[ELSE:.*]], label %[[FLOW:.*]] ; CHECK: [[FLOW]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi float [ [[V_1]], %[[ELSE]] ], [ undef, %[[HEADER]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ [[CC2:%.*]], %[[ELSE]] ], [ false, %[[HEADER]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ false, %[[ELSE]] ], [ true, %[[HEADER]] ] -; CHECK-NEXT: br i1 [[TMP2]], label %[[IF:.*]], label %[[FLOW1:.*]] +; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ [[CC2:%.*]], %[[ELSE]] ], [ false, %[[HEADER]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[ELSE]] ], [ true, %[[HEADER]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[IF:.*]], label %[[FLOW1:.*]] ; CHECK: [[IF]]: ; 
CHECK-NEXT: [[V_IF:%.*]] = fadd float [[V_1]], 1.000000e+00 ; CHECK-NEXT: br label %[[FLOW1]] @@ -23,17 +22,17 @@ define float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[CC2]] = icmp slt i32 [[IND]], [[Y]] ; CHECK-NEXT: br label %[[FLOW]] ; CHECK: [[FLOW1]]: -; CHECK-NEXT: [[TMP8]] = phi float [ [[V_IF]], %[[IF]] ], [ [[TMP0]], %[[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, %[[IF]] ], [ [[TMP1]], %[[FLOW]] ] -; CHECK-NEXT: br i1 [[TMP4]], label %[[LATCH:.*]], label %[[FLOW2]] +; CHECK-NEXT: [[TMP8]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_1]], %[[FLOW]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ true, %[[IF]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: br i1 [[TMP3]], label %[[LATCH:.*]], label %[[FLOW2]] ; CHECK: [[LATCH]]: ; CHECK-NEXT: [[IND_INC:%.*]] = add i32 [[IND]], 1 ; CHECK-NEXT: [[CC3:%.*]] = icmp slt i32 [[IND]], [[Z]] ; CHECK-NEXT: br label %[[FLOW2]] ; CHECK: [[FLOW2]]: -; CHECK-NEXT: [[TMP5]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] -; CHECK-NEXT: br i1 [[TMP6]], label %[[END:.*]], label %[[HEADER]] +; CHECK-NEXT: [[TMP4]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ poison, %[[FLOW1]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[END:.*]], label %[[HEADER]] ; CHECK: [[END]]: ; CHECK-NEXT: ret float [[TMP8]] ; @@ -80,7 +79,7 @@ define float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[V_IF:%.*]] = fadd float [[V_1]], 1.000000e+00 ; CHECK-NEXT: br label %[[FLOW]] ; CHECK: [[FLOW]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ undef, %[[HEADER]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ poison, %[[HEADER]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ true, %[[IF]] ], [ false, %[[HEADER]] ] ; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ false, %[[IF]] ], [ true, %[[HEADER]] ] ; CHECK-NEXT: br 
i1 [[TMP2]], label %[[ELSE:.*]], label %[[FLOW1:.*]] @@ -96,7 +95,7 @@ define float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[CC3:%.*]] = icmp slt i32 [[IND]], [[Z]] ; CHECK-NEXT: br label %[[FLOW2]] ; CHECK: [[FLOW2]]: -; CHECK-NEXT: [[TMP5]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] +; CHECK-NEXT: [[TMP5]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ poison, %[[FLOW1]] ] ; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] ; CHECK-NEXT: br i1 [[TMP6]], label %[[END:.*]], label %[[HEADER]] ; CHECK: [[END]]: @@ -159,7 +158,7 @@ define < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i32 %y, i32 ; CHECK-NEXT: [[CC3:%.*]] = icmp slt i32 [[IND]], [[Z]] ; CHECK-NEXT: br label %[[FLOW1]] ; CHECK: [[FLOW1]]: -; CHECK-NEXT: [[TMP3]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW]] ] +; CHECK-NEXT: [[TMP3]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ poison, %[[FLOW]] ] ; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW]] ] ; CHECK-NEXT: br i1 [[TMP4]], label %[[END:.*]], label %[[HEADER]] ; CHECK: [[END]]: diff --git a/llvm/test/Transforms/StructurizeCFG/loop-continue-phi.ll b/llvm/test/Transforms/StructurizeCFG/loop-continue-phi.ll index eec67e67b540d..0effbed6e311c 100644 --- a/llvm/test/Transforms/StructurizeCFG/loop-continue-phi.ll +++ b/llvm/test/Transforms/StructurizeCFG/loop-continue-phi.ll @@ -4,22 +4,21 @@ define void @test1(i1 %arg) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: -; CHECK-NEXT: %arg.inv = xor i1 %arg, true -; CHECK-NEXT: br label %loop +; CHECK-NEXT: [[ARG_INV:%.*]] = xor i1 [[ARG:%.*]], true +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: Flow: -; CHECK-NEXT: br label %Flow1 +; CHECK-NEXT: br label [[FLOW1:%.*]] ; CHECK: loop: -; CHECK-NEXT: %ctr = phi i32 [ 0, %entry ], [ %0, %Flow1 ] -; CHECK-NEXT: %ctr.next = add i32 %ctr, 1 -; CHECK-NEXT: br i1 %arg.inv, label %loop.a, label %Flow1 +; CHECK-NEXT: [[CTR:%.*]] = phi i32 [ 0, 
[[ENTRY:%.*]] ], [ [[CTR_NEXT:%.*]], [[FLOW1]] ] +; CHECK-NEXT: [[CTR_NEXT]] = add i32 [[CTR]], 1 +; CHECK-NEXT: br i1 [[ARG_INV]], label [[LOOP_A:%.*]], label [[FLOW1]] ; CHECK: loop.a: -; CHECK-NEXT: br i1 %arg.inv, label %loop.b, label %Flow +; CHECK-NEXT: br i1 [[ARG_INV]], label [[LOOP_B:%.*]], label [[FLOW:%.*]] ; CHECK: loop.b: -; CHECK-NEXT: br label %Flow +; CHECK-NEXT: br label [[FLOW]] ; CHECK: Flow1: -; CHECK-NEXT: %0 = phi i32 [ %ctr.next, %Flow ], [ undef, %loop ] -; CHECK-NEXT: %1 = phi i1 [ false, %Flow ], [ true, %loop ] -; CHECK-NEXT: br i1 %1, label %exit, label %loop +; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ false, [[FLOW]] ], [ true, [[LOOP]] ] +; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll b/llvm/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll index 6f1c9a833804b..1389b12cdc53a 100644 --- a/llvm/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll +++ b/llvm/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll @@ -16,7 +16,6 @@ define void @blam(ptr addrspace(1) nocapture %arg, float %arg1, float %arg2) { ; CHECK-NEXT: [[TMP6:%.*]] = fcmp uge float 0.000000e+00, [[ARG2:%.*]] ; CHECK-NEXT: br label [[FLOW]] ; CHECK: Flow: -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 15, [[BB5]] ], [ undef, [[BB3]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP6]], [[BB5]] ], [ [[TMP4]], [[BB3]] ] ; CHECK-NEXT: br i1 [[TMP1]], label [[BB7:%.*]], label [[FLOW1]] ; CHECK: bb7: @@ -24,10 +23,10 @@ define void @blam(ptr addrspace(1) nocapture %arg, float %arg1, float %arg2) { ; CHECK-NEXT: [[TMP9:%.*]] = icmp sge i64 [[TMP8]], 5 ; CHECK-NEXT: br label [[FLOW1]] ; CHECK: Flow1: -; CHECK-NEXT: [[TMP2]] = phi i64 [ [[TMP8]], [[BB7]] ], [ undef, [[FLOW]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ 255, [[BB7]] ], [ [[TMP0]], [[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP9]], [[BB7]] ], [ true, [[FLOW]] ] 
-; CHECK-NEXT: br i1 [[TMP4]], label [[BB10:%.*]], label [[BB3]] +; CHECK-NEXT: [[TMP2]] = phi i64 [ [[TMP8]], [[BB7]] ], [ poison, [[FLOW]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ 255, [[BB7]] ], [ 15, [[FLOW]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP9]], [[BB7]] ], [ true, [[FLOW]] ] +; CHECK-NEXT: br i1 [[TMP5]], label [[BB10:%.*]], label [[BB3]] ; CHECK: bb10: ; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[ARG:%.*]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/StructurizeCFG/simple-structurizecfg-crash.ll b/llvm/test/Transforms/StructurizeCFG/simple-structurizecfg-crash.ll new file mode 100644 index 0000000000000..691f43bdcf948 --- /dev/null +++ b/llvm/test/Transforms/StructurizeCFG/simple-structurizecfg-crash.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s + +; Issue tracking: https://github.com/llvm/llvm-project/issues/126534. 
+ +define void @foo() { +; CHECK-LABEL: define void @foo() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[COND_FALSE:.*]] +; CHECK: [[COND_TRUE:.*]]: +; CHECK-NEXT: br label %[[COND_END:.*]] +; CHECK: [[COND_FALSE]]: +; CHECK-NEXT: br i1 false, label %[[COND_TRUE]], label %[[COND_END]] +; CHECK: [[COND_END]]: +; CHECK-NEXT: ret void +; +entry: + br i1 false, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + ret void +} diff --git a/llvm/test/Transforms/StructurizeCFG/structurizecfg-debug-loc.ll b/llvm/test/Transforms/StructurizeCFG/structurizecfg-debug-loc.ll index d5ee7693032e4..8a79526ad3412 100644 --- a/llvm/test/Transforms/StructurizeCFG/structurizecfg-debug-loc.ll +++ b/llvm/test/Transforms/StructurizeCFG/structurizecfg-debug-loc.ll @@ -5,7 +5,7 @@ define void @if_then_else(ptr addrspace(1) %out, i1 %arg) !dbg !7 { ; CHECK: entry: ; CHECK: br i1 {{.*}}, label %if.else, label %Flow, !dbg [[ITE_ENTRY_DL:![0-9]+]] ; CHECK: Flow: -; CHECK: br i1 {{.*}}, label %if.then, label %exit, !dbg [[ITE_ENTRY_DL]] +; CHECK: br i1 {{.*}}, label %if.then, label %exit ; CHECK: if.then: ; CHECK: br label %exit, !dbg [[ITE_IFTHEN_DL:![0-9]+]] ; CHECK: if.else: @@ -36,7 +36,7 @@ define void @while_loop(ptr addrspace(1) %out) !dbg !14 { ; CHECK: while.body: ; CHECK: br label %Flow, !dbg [[WHILE_BODY_DL:![0-9]+]] ; CHECK: Flow: -; CHECK: br i1 {{.*}}, label %exit, label %while.header, !dbg [[WHILE_HEADER_DL]] +; CHECK: br i1 {{.*}}, label %exit, label %while.header ; CHECK: exit: ; entry: @@ -63,7 +63,7 @@ define void @while_multiple_exits(ptr addrspace(1) %out) !dbg !21 { ; CHECK: while.exiting: ; CHECK: br label %Flow, !dbg [[WHILEME_EXITING_DL:![0-9]+]] ; CHECK: Flow: -; CHECK: br i1 {{.*}}, label %exit, label %while.header, !dbg [[WHILEME_HEADER_DL]] +; CHECK: br i1 {{.*}}, label %exit, label %while.header ; CHECK: exit: ; entry: 
@@ -86,11 +86,11 @@ define void @nested_if_then_else(ptr addrspace(1) %out, i1 %a, i1 %b) !dbg !28 { ; CHECK: entry: ; CHECK: br i1 {{.*}}, label %if.else, label %Flow4, !dbg [[NESTED_ENTRY_DL:![0-9]+]] ; CHECK: Flow4: -; CHECK: br i1 {{.*}}, label %if.then, label %exit, !dbg [[NESTED_ENTRY_DL]] +; CHECK: br i1 {{.*}}, label %if.then, label %exit ; CHECK: if.then: ; CHECK: br i1 {{.*}}, label %if.then.else, label %Flow2, !dbg [[NESTED_IFTHEN_DL:![0-9]+]] ; CHECK: Flow2: -; CHECK: br i1 {{.*}}, label %if.then.then, label %Flow3, !dbg [[NESTED_IFTHEN_DL]] +; CHECK: br i1 {{.*}}, label %if.then.then, label %Flow3 ; CHECK: if.then.then: ; CHECK: br label %Flow3, !dbg [[NESTED_IFTHENTHEN_DL:![0-9]+]] ; CHECK: if.then.else: @@ -98,15 +98,15 @@ define void @nested_if_then_else(ptr addrspace(1) %out, i1 %a, i1 %b) !dbg !28 { ; CHECK: if.else: ; CHECK: br i1 {{.*}}, label %if.else.else, label %Flow, !dbg [[NESTED_IFELSE_DL:![0-9]+]] ; CHECK: Flow: -; CHECK: br i1 {{.*}}, label %if.else.then, label %Flow1, !dbg [[NESTED_IFELSE_DL]] +; CHECK: br i1 {{.*}}, label %if.else.then, label %Flow1 ; CHECK: if.else.then: ; CHECK: br label %Flow1, !dbg [[NESTED_IFELSETHEN_DL:![0-9]+]] ; CHECK: if.else.else: ; CHECK: br label %Flow, !dbg [[NESTED_IFELSEELSE_DL:![0-9]+]] ; CHECK: Flow1: -; CHECK: br label %Flow4, !dbg [[NESTED_IFELSE_DL]] +; CHECK: br label %Flow4 ; CHECK: Flow3: -; CHECK: br label %exit, !dbg [[NESTED_IFTHEN_DL]] +; CHECK: br label %exit ; CHECK: exit: ; entry: diff --git a/llvm/test/Transforms/StructurizeCFG/structurizer-keep-perf-md.ll b/llvm/test/Transforms/StructurizeCFG/structurizer-keep-perf-md.ll index cdf5ca569701b..583c97852ff6d 100644 --- a/llvm/test/Transforms/StructurizeCFG/structurizer-keep-perf-md.ll +++ b/llvm/test/Transforms/StructurizeCFG/structurizer-keep-perf-md.ll @@ -7,15 +7,14 @@ define amdgpu_ps i32 @if_else(i32 %0) { ; OPT-NEXT: [[C:%.*]] = icmp ne i32 [[TMP0]], 0 ; OPT-NEXT: br i1 [[C]], label %[[FALSE:.*]], label %[[FLOW:.*]], !prof 
[[PROF0:![0-9]+]] ; OPT: [[FLOW]]: -; OPT-NEXT: [[TMP2:%.*]] = phi i32 [ 33, %[[FALSE]] ], [ undef, [[TMP1:%.*]] ] -; OPT-NEXT: [[TMP3:%.*]] = phi i1 [ false, %[[FALSE]] ], [ true, [[TMP1]] ] -; OPT-NEXT: br i1 [[TMP3]], label %[[TRUE:.*]], label %[[EXIT:.*]] +; OPT-NEXT: [[TMP2:%.*]] = phi i1 [ false, %[[FALSE]] ], [ true, [[TMP1:%.*]] ] +; OPT-NEXT: br i1 [[TMP2]], label %[[TRUE:.*]], label %[[EXIT:.*]] ; OPT: [[TRUE]]: ; OPT-NEXT: br label %[[EXIT]] ; OPT: [[FALSE]]: ; OPT-NEXT: br label %[[FLOW]] ; OPT: [[EXIT]]: -; OPT-NEXT: [[RET:%.*]] = phi i32 [ [[TMP2]], %[[FLOW]] ], [ 42, %[[TRUE]] ] +; OPT-NEXT: [[RET:%.*]] = phi i32 [ 33, %[[FLOW]] ], [ 42, %[[TRUE]] ] ; OPT-NEXT: ret i32 [[RET]] ; %c = icmp eq i32 %0, 0 @@ -45,7 +44,7 @@ define amdgpu_ps void @loop_if_break(i32 %n) { ; OPT-NEXT: [[I_NEXT:%.*]] = sub i32 [[I]], 1 ; OPT-NEXT: br label %[[FLOW]] ; OPT: [[FLOW]]: -; OPT-NEXT: [[TMP0]] = phi i32 [ [[I_NEXT]], %[[LOOP_BODY]] ], [ undef, %[[LOOP]] ] +; OPT-NEXT: [[TMP0]] = phi i32 [ [[I_NEXT]], %[[LOOP_BODY]] ], [ poison, %[[LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[LOOP_BODY]] ], [ true, %[[LOOP]] ] ; OPT-NEXT: br i1 [[TMP1]], label %[[EXIT:.*]], label %[[LOOP]] ; OPT: [[EXIT]]: diff --git a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fix-reducible.ll b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fix-reducible.ll index d09d7454793bb..8fbeb6e811805 100644 --- a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fix-reducible.ll +++ b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fix-reducible.ll @@ -12,22 +12,20 @@ define void @irreducible(i1 %PredEntry, i1 %PredB1, i1 %PredB2, i1 %PredB3, i1 % ; CHECK-NEXT: [[PREDENTRY_INV:%.*]] = xor i1 [[PREDENTRY:%.*]], true ; CHECK-NEXT: br label [[IRR_GUARD:%.*]] ; CHECK: Flow4: -; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ true, [[B3:%.*]] ], [ undef, [[B2:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ [[PREDB3:%.*]], [[B3]] ], [ true, [[B2]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ 
[[PREDB3:%.*]], [[B3:%.*]] ], [ true, [[B2:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ [[PREDB3]], [[B3]] ], [ false, [[B2]] ] ; CHECK-NEXT: br label [[FLOW3:%.*]] ; CHECK: B1: ; CHECK-NEXT: br label [[FLOW5:%.*]] ; CHECK: Flow2: -; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ [[TMP9:%.*]], [[FLOW5]] ], [ [[TMP11:%.*]], [[FLOW:%.*]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP10:%.*]], [[FLOW5]] ], [ [[TMP12:%.*]], [[FLOW]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ true, [[FLOW5]] ], [ [[TMP9:%.*]], [[FLOW:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, [[FLOW5]] ], [ [[TMP12:%.*]], [[FLOW]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP7:%.*]], [[FLOW5]] ], [ true, [[FLOW]] ] ; CHECK-NEXT: br i1 true, label [[FLOW6:%.*]], label [[FLOW]] ; CHECK: B2: ; CHECK-NEXT: br i1 [[PREDB2_INV]], label [[B3]], label [[FLOW4:%.*]] ; CHECK: Flow3: -; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP0]], [[FLOW4]] ], [ undef, [[IRR_GUARD1:%.*]] ] -; CHECK-NEXT: [[TMP7]] = phi i1 [ [[TMP1]], [[FLOW4]] ], [ true, [[IRR_GUARD1]] ] +; CHECK-NEXT: [[TMP7]] = phi i1 [ [[TMP1]], [[FLOW4]] ], [ true, [[IRR_GUARD1:%.*]] ] ; CHECK-NEXT: [[TMP8:%.*]] = phi i1 [ [[TMP2]], [[FLOW4]] ], [ true, [[IRR_GUARD1]] ] ; CHECK-NEXT: br i1 [[TMP8]], label [[B1:%.*]], label [[FLOW5]] ; CHECK: B3: @@ -35,8 +33,6 @@ define void @irreducible(i1 %PredEntry, i1 %PredB1, i1 %PredB2, i1 %PredB3, i1 % ; CHECK: B4: ; CHECK-NEXT: br label [[FLOW]] ; CHECK: Flow5: -; CHECK-NEXT: [[TMP9]] = phi i1 [ undef, [[B1]] ], [ [[TMP6]], [[FLOW3]] ] -; CHECK-NEXT: [[TMP10]] = phi i1 [ true, [[B1]] ], [ undef, [[FLOW3]] ] ; CHECK-NEXT: br label [[FLOW2:%.*]] ; CHECK: Flow6: ; CHECK-NEXT: br i1 [[TMP5]], label [[EXIT:%.*]], label [[IRR_GUARD]] @@ -46,7 +42,7 @@ define void @irreducible(i1 %PredEntry, i1 %PredB1, i1 %PredB2, i1 %PredB3, i1 % ; CHECK-NEXT: [[GUARD_B4:%.*]] = phi i1 [ [[PREDENTRY_INV]], [[ENTRY:%.*]] ], [ [[TMP3]], [[FLOW6]] ] ; CHECK-NEXT: br i1 [[GUARD_B4]], label [[B4:%.*]], label [[FLOW]] ; CHECK: Flow: -; CHECK-NEXT: 
[[TMP11]] = phi i1 [ [[TMP3]], [[FLOW2]] ], [ undef, [[B4]] ], [ undef, [[IRR_GUARD]] ] +; CHECK-NEXT: [[TMP9]] = phi i1 [ [[TMP3]], [[FLOW2]] ], [ poison, [[B4]] ], [ poison, [[IRR_GUARD]] ] ; CHECK-NEXT: [[TMP12]] = phi i1 [ [[TMP4]], [[FLOW2]] ], [ true, [[B4]] ], [ false, [[IRR_GUARD]] ] ; CHECK-NEXT: [[TMP13:%.*]] = phi i1 [ false, [[FLOW2]] ], [ [[PREDB4:%.*]], [[B4]] ], [ true, [[IRR_GUARD]] ] ; CHECK-NEXT: br i1 [[TMP13]], label [[IRR_GUARD1]], label [[FLOW2]] diff --git a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll index 912beed6b2eed..17e24ea7f6d9a 100644 --- a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll +++ b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll @@ -47,10 +47,7 @@ define void @irreducible_mountain_bug(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3 ; CHECK: while.body63: ; CHECK-NEXT: br i1 [[PRED5_INV]], label [[WHILE_COND47]], label [[FLOW10:%.*]] ; CHECK: Flow9: -; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP3]], [[FLOW10]] ], [ undef, [[IRR_GUARD1:%.*]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ false, [[FLOW10]] ], [ undef, [[IRR_GUARD1]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi i1 [ true, [[FLOW10]] ], [ undef, [[IRR_GUARD1]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi i1 [ true, [[FLOW10]] ], [ undef, [[IRR_GUARD1]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP3]], [[FLOW10]] ], [ poison, [[IRR_GUARD1:%.*]] ] ; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ [[TMP4]], [[FLOW10]] ], [ true, [[IRR_GUARD1]] ] ; CHECK-NEXT: br i1 [[TMP9]], label [[COND_TRUE49:%.*]], label [[FLOW11]] ; CHECK: while.cond47: @@ -58,8 +55,7 @@ define void @irreducible_mountain_bug(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3 ; CHECK: cond.end61: ; CHECK-NEXT: br label [[FLOW12:%.*]] ; CHECK: Flow17: -; CHECK-NEXT: [[TMP10:%.*]] = phi i1 [ [[TMP19:%.*]], [[FLOW18:%.*]] ], [ undef, [[LOOP_EXIT_GUARD2:%.*]] ] -; CHECK-NEXT: [[TMP11:%.*]] = phi i1 [ [[TMP20:%.*]], [[FLOW18]] ], [ 
[[DOTINV:%.*]], [[LOOP_EXIT_GUARD2]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi i1 [ [[TMP20:%.*]], [[FLOW18:%.*]] ], [ [[DOTINV:%.*]], [[LOOP_EXIT_GUARD2:%.*]] ] ; CHECK-NEXT: br label [[FLOW16:%.*]] ; CHECK: if.then69: ; CHECK-NEXT: br label [[FLOW18]] @@ -92,30 +88,26 @@ define void @irreducible_mountain_bug(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3 ; CHECK: exit: ; CHECK-NEXT: ret void ; CHECK: Flow15: -; CHECK-NEXT: [[TMP12:%.*]] = phi i1 [ true, [[LOR_RHS]] ], [ undef, [[WHILE_COND]] ] ; CHECK-NEXT: [[TMP13:%.*]] = phi i1 [ true, [[LOR_RHS]] ], [ false, [[WHILE_COND]] ] ; CHECK-NEXT: [[TMP14:%.*]] = phi i1 [ [[PRED9:%.*]], [[LOR_RHS]] ], [ true, [[WHILE_COND]] ] ; CHECK-NEXT: br i1 [[TMP14]], label [[IRR_GUARD:%.*]], label [[FLOW16]] ; CHECK: irr.guard: -; CHECK-NEXT: [[GUARD_COND_END61:%.*]] = phi i1 [ [[TMP28:%.*]], [[FLOW13:%.*]] ], [ [[TMP13]], [[FLOW15]] ] +; CHECK-NEXT: [[GUARD_COND_END61:%.*]] = phi i1 [ true, [[FLOW13:%.*]] ], [ [[TMP13]], [[FLOW15]] ] ; CHECK-NEXT: br i1 [[GUARD_COND_END61]], label [[COND_END61:%.*]], label [[FLOW12]] ; CHECK: Flow12: -; CHECK-NEXT: [[TMP15:%.*]] = phi i1 [ false, [[COND_END61]] ], [ undef, [[IRR_GUARD]] ] -; CHECK-NEXT: [[TMP16:%.*]] = phi i1 [ true, [[COND_END61]] ], [ undef, [[IRR_GUARD]] ] ; CHECK-NEXT: [[TMP17:%.*]] = phi i1 [ true, [[COND_END61]] ], [ false, [[IRR_GUARD]] ] ; CHECK-NEXT: [[TMP18:%.*]] = phi i1 [ [[PRED7:%.*]], [[COND_END61]] ], [ true, [[IRR_GUARD]] ] ; CHECK-NEXT: br i1 [[TMP18]], label [[IRR_GUARD1]], label [[FLOW13]] ; CHECK: irr.guard1: -; CHECK-NEXT: [[GUARD_WHILE_BODY63:%.*]] = phi i1 [ [[TMP23:%.*]], [[FLOW11]] ], [ [[TMP17]], [[FLOW12]] ] +; CHECK-NEXT: [[GUARD_WHILE_BODY63:%.*]] = phi i1 [ true, [[FLOW11]] ], [ [[TMP17]], [[FLOW12]] ] ; CHECK-NEXT: br i1 [[GUARD_WHILE_BODY63]], label [[WHILE_BODY63]], label [[FLOW9]] ; CHECK: Flow18: -; CHECK-NEXT: [[TMP19]] = phi i1 [ false, [[IF_THEN69:%.*]] ], [ [[TMP31:%.*]], [[LOOP_EXIT_GUARD3:%.*]] ] -; CHECK-NEXT: [[TMP20]] = phi i1 [ [[PRED8:%.*]], 
[[IF_THEN69]] ], [ [[DOTINV]], [[LOOP_EXIT_GUARD3]] ] +; CHECK-NEXT: [[TMP20]] = phi i1 [ [[PRED8:%.*]], [[IF_THEN69:%.*]] ], [ [[DOTINV]], [[LOOP_EXIT_GUARD3:%.*]] ] ; CHECK-NEXT: br label [[FLOW17:%.*]] ; CHECK: loop.exit.guard: ; CHECK-NEXT: br i1 [[TMP21:%.*]], label [[WHILE_END76:%.*]], label [[FLOW8]] ; CHECK: Flow16: -; CHECK-NEXT: [[TMP21]] = phi i1 [ [[TMP10]], [[FLOW17]] ], [ [[TMP12]], [[FLOW15]] ] +; CHECK-NEXT: [[TMP21]] = phi i1 [ false, [[FLOW17]] ], [ true, [[FLOW15]] ] ; CHECK-NEXT: [[TMP22:%.*]] = phi i1 [ [[TMP11]], [[FLOW17]] ], [ true, [[FLOW15]] ] ; CHECK-NEXT: br i1 [[TMP22]], label [[LOOP_EXIT_GUARD:%.*]], label [[WHILE_COND]] ; CHECK: loop.exit.guard2: @@ -123,17 +115,13 @@ define void @irreducible_mountain_bug(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3 ; CHECK: loop.exit.guard3: ; CHECK-NEXT: br i1 [[DOTINV14:%.*]], label [[IF_THEN69]], label [[FLOW18]] ; CHECK: Flow11: -; CHECK-NEXT: [[TMP23]] = phi i1 [ true, [[COND_TRUE49]] ], [ undef, [[FLOW9]] ] ; CHECK-NEXT: [[TMP24:%.*]] = phi i1 [ true, [[COND_TRUE49]] ], [ [[TMP5]], [[FLOW9]] ] -; CHECK-NEXT: [[TMP25:%.*]] = phi i1 [ false, [[COND_TRUE49]] ], [ [[TMP6]], [[FLOW9]] ] -; CHECK-NEXT: [[TMP26:%.*]] = phi i1 [ false, [[COND_TRUE49]] ], [ [[TMP7]], [[FLOW9]] ] +; CHECK-NEXT: [[TMP16:%.*]] = phi i1 [ false, [[COND_TRUE49]] ], [ true, [[FLOW9]] ] ; CHECK-NEXT: [[TMP27:%.*]] = phi i1 [ [[PRED4:%.*]], [[COND_TRUE49]] ], [ true, [[FLOW9]] ] ; CHECK-NEXT: br i1 [[TMP27]], label [[LOOP_EXIT_GUARD4:%.*]], label [[IRR_GUARD1]] ; CHECK: Flow13: -; CHECK-NEXT: [[TMP28]] = phi i1 [ [[TMP8]], [[LOOP_EXIT_GUARD4]] ], [ undef, [[FLOW12]] ] -; CHECK-NEXT: [[TMP29:%.*]] = phi i1 [ [[TMP26]], [[LOOP_EXIT_GUARD4]] ], [ [[TMP15]], [[FLOW12]] ] -; CHECK-NEXT: [[TMP30:%.*]] = phi i1 [ [[TMP25]], [[LOOP_EXIT_GUARD4]] ], [ [[TMP16]], [[FLOW12]] ] -; CHECK-NEXT: [[TMP31]] = phi i1 [ [[TMP6]], [[LOOP_EXIT_GUARD4]] ], [ undef, [[FLOW12]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi i1 [ [[TMP16]], [[LOOP_EXIT_GUARD4]] ], 
[ false, [[FLOW12]] ] +; CHECK-NEXT: [[TMP30:%.*]] = phi i1 [ false, [[LOOP_EXIT_GUARD4]] ], [ true, [[FLOW12]] ] ; CHECK-NEXT: [[TMP32:%.*]] = phi i1 [ [[TMP24]], [[LOOP_EXIT_GUARD4]] ], [ true, [[FLOW12]] ] ; CHECK-NEXT: [[DOTINV14]] = xor i1 [[TMP29]], true ; CHECK-NEXT: [[DOTINV]] = xor i1 [[TMP30]], true diff --git a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll index 6f6fc4d0f4e64..ff989a655d54a 100644 --- a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll +++ b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll @@ -28,7 +28,6 @@ define void @exiting-block(i1 %PredH1, i1 %PredB2, i1 %PredB1, i1 %PredH2) { ; CHECK: H1: ; CHECK-NEXT: br i1 [[PREDH1_INV]], label [[B1:%.*]], label [[FLOW3:%.*]] ; CHECK: Flow3: -; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ true, [[B1]] ], [ undef, [[H1]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ [[PREDB1:%.*]], [[B1]] ], [ [[PREDH1]], [[H1]] ] ; CHECK-NEXT: br i1 [[TMP1]], label [[H2:%.*]], label [[FLOW4:%.*]] ; CHECK: H2: @@ -58,7 +57,7 @@ define void @exiting-block(i1 %PredH1, i1 %PredB2, i1 %PredB1, i1 %PredH2) { ; CHECK-NEXT: [[TMP5]] = phi i1 [ false, [[L2]] ], [ true, [[B2]] ] ; CHECK-NEXT: br label [[FLOW]] ; CHECK: Flow4: -; CHECK-NEXT: [[TMP6]] = phi i1 [ false, [[FLOW5]] ], [ [[TMP0]], [[FLOW3]] ] +; CHECK-NEXT: [[TMP6]] = phi i1 [ false, [[FLOW5]] ], [ true, [[FLOW3]] ] ; CHECK-NEXT: [[TMP7:%.*]] = phi i1 [ [[TMP4]], [[FLOW5]] ], [ true, [[FLOW3]] ] ; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_EXIT_GUARD:%.*]], label [[H1]] ; CHECK: loop.exit.guard1: diff --git a/llvm/test/tools/llvm-objcopy/ELF/dump-offload-bundle.test b/llvm/test/tools/llvm-objcopy/ELF/dump-offload-bundle.test new file mode 100644 index 0000000000000..518cdace8e29c --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/ELF/dump-offload-bundle.test @@ -0,0 +1,60 @@ +## Test that --offloading with a fatbin works 
correctly +# REQUIRES: target={{x86_64-.*-linux.*}} +# REQUIRES: amdgpu-registered-target + +# RUN: yaml2obj %s -o %t.elf +# RUN: llvm-objcopy --dump-offload-bundle=file://%t.elf#offset=8192\&size=4048 +# RUN: llvm-objdump -d %t.elf-offset8192-size4048.co | FileCheck %s + +# CHECK: s_load_dword s7, s[4:5], 0x24 // 000000001900: C00201C2 00000024 +# CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 // 000000001908: C00A0002 00000000 +# CHECK-NEXT: v_mov_b32_e32 v1, 0 // 000000001910: 7E020280 +# CHECK-NEXT: s_waitcnt lgkmcnt(0) // 000000001914: BF8CC07F +# CHECK-NEXT: s_and_b32 s4, s7, 0xffff // 000000001918: 8604FF07 0000FFFF +# CHECK-NEXT: s_mul_i32 s6, s6, s4 // 000000001920: 92060406 +# CHECK-NEXT: v_add_u32_e32 v0, s6, v0 // 000000001924: 68000006 +# CHECK-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] // 000000001928: D28F0000 00020082 +# CHECK-NEXT: v_mov_b32_e32 v3, s3 // 000000001930: 7E060203 +# CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 // 000000001934: 32040002 +# CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc // 000000001938: 38060303 +# CHECK-NEXT: global_load_dword v2, v[2:3], off // 00000000193C: DC508000 027F0002 +# CHECK-NEXT: v_mov_b32_e32 v3, s1 // 000000001944: 7E060201 +# CHECK-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 // 000000001948: 32000000 +# CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc // 00000000194C: 38020303 +# CHECK-NEXT: global_load_dword v3, v[0:1], off // 000000001950: DC508000 037F0000 +# CHECK-NEXT: s_waitcnt vmcnt(0) // 000000001958: BF8C0F70 +# CHECK-NEXT: v_add_u32_e32 v2, v3, v2 // 00000000195C: 68040503 +# CHECK-NEXT: global_store_dword v[0:1], v2, off // 000000001960: DC708000 007F0200 +# CHECK-NEXT: s_endpgm // 000000001968: BF810000 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 + Entry: 0x2041B0 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x200040 + Align: 0x8 + Offset: 0x40 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Align: 0x0 + Offset: 0x0 
+Sections: + - Name: .hip_fatbin + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x201000 + AddressAlign: 0x1000 + Content: 5F5F434C414E475F4F46464C4F41445F42554E444C455F5F0200000000000000001000000000000000000000000000001B00000000000000686F73742D7838365F36342D756E6B6E6F776E2D6C696E75782D2D0010000000000000D00F0000000000001F0000000000000068697076342D616D6467636E2D616D642D616D646873612D2D676678393038000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000007F454C460201014003000000000000000300E0000100000000000000000000004000000000000000100C0000000000003005000040003800090040000F000D000600000004000000400000000000000040000000000000004000000000000000F801000000000000F80100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000C008000000000000C008000000000000001000000000000001000000050000000009000000000000001900000000000000190000000000006C000000000000006C00000000000000001000000000000001000000060000007009000000000000702900000000000070290000000000007000000000000000900600000000000000100000000000000100000006000000E009000000000000E039000000000000E039000000000000000000000000000001000000000000000010000000000000020000000600000070090000000000007029000000000000702900000000000070000000000000007000000000000000080000000000000052E574640400000070090000000000007029000000000000702900000000000070000000000000009006000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000004000000380200000000000038020000000000003802000000000000340500000000000034050000000000000400000000000000070000001D05000020000000414D44475055000083AE616D646873612E6B65726E656C7391DE0012AB2E616770725F636F756E7400A52E61726773DC001085AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA415F642E636F65726365A72E6F666673657400A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657285AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA425F642E636F65726365A72E6F666673657408A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657284A52E6E616D65A14EA72E6F666673657410A52E73697A6508AB2E76616C75655F6B696E64A862795F76616C7
56583A72E6F666673657418A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7883A72E6F66667365741CA52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7983A72E6F666673657420A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7A83A72E6F666673657424A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7883A72E6F666673657426A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7983A72E6F666673657428A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7A83A72E6F66667365742AA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7883A72E6F66667365742CA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7983A72E6F66667365742EA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7A83A72E6F666673657440A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7883A72E6F666673657448A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7983A72E6F666673657450A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7A83A72E6F666673657458A52E73697A6502AB2E76616C75655F6B696E64B068696464656E5F677269645F64696D73B92E67726F75705F7365676D656E745F66697865645F73697A6500B62E6B65726E6172675F7365676D656E745F616C69676E08B52E6B65726E6172675F7365676D656E745F73697A65CD0118A92E6C616E6775616765A84F70656E434C2043B12E6C616E67756167655F76657273696F6E920200B82E6D61785F666C61745F776F726B67726F75705F73697A65CD0400A52E6E616D65B25F5A3973696D706C65416464506A504B6A6DBB2E707269766174655F7365676D656E745F66697865645F73697A6500AB2E736770725F636F756E740CB12E736770725F7370696C6C5F636F756E7400A72E73796D626F6CB55F5A3973696D706C65416464506A504B6A6D2E6B64B82E756E69666F726D5F776F726B5F67726F75705F73697A6501B32E757365735F64796E616D69635F737461636BC2AB2E766770725F636F756E7404B12E766770725F7370696C6C5F636F756E7400AF2E7761766566726F6E745F73697A654
0AD616D646873612E746172676574B9616D6467636E2D616D642D616D646873612D2D676678393038AE616D646873612E76657273696F6E92010200000000000000000000000000000000000000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E03900000000000001000000000000000100000001000000010000001A000000000008400000D20001000000360A4A7A5238A4D3F113F4DD04000000040000000200000001000000000000000300000000000000000000000000000000000000005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F623730363264386333326134613933330000000000000000000000000000000000000000000000000000000000000000000000180100000000000080100000000000000000000000000000000000000000000000000000000000004000AF008C000000090000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000C20102C02400000002000AC0000000008002027E7FC08CBF07FF0486FFFF0000060406920600006800008FD2820002000302067E0200043203030638008050DC02007F020102067E0000003203030238008050DC00007F03700F8CBF03050468008070DC00027F00000081BF00000000060000000000000070070000000000000B000000000000001800000000000000050000000000000020080000000000000A000000000000004600000000000000F5FEFF6F00000000D0070000000000000400000000000000F807000000000000000000000000000000000000000000004C696E6B65723A20414D44204C4C442031392E302E3000414D4420636C616E672076657273696F6E2031392E302E306769742028202032343231322063393630313665636534313337356462646438663037356266333762643666633333323230376233290000414D4420636C616E672076657273696F6E2031382E302E3067697420287373683A2F2F6765727269746769742F6C696768746E696E672F65632F6C6C766D2D70726F6A65637420616D642D6D61696E6C696E652D6F70656E20323431373620663935303039613166393032313232343865313036333964653837653635636163616338643961372900000000000000000000000000000000000000000000000000460000000002080070290000000000000000000000000000010000001203070000190000000000006C00000000000
0001400000011030600800800000000000040000000000000002A00000011000A00E0390000000000000100000000000000002E6E6F7465002E64796E73796D002E676E752E68617368002E68617368002E64796E737472002E726F64617461002E74657874002E64796E616D6963002E72656C726F5F70616464696E67002E627373002E636F6D6D656E74002E73796D746162002E7368737472746162002E73747274616200005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F62373036326438633332613461393333005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000070000000200000000000000380200000000000038020000000000003405000000000000000000000000000004000000000000000000000000000000070000000B00000002000000000000007007000000000000700700000000000060000000000000000500000001000000080000000000000018000000000000000F000000F6FFFF6F0200000000000000D007000000000000D007000000000000280000000000000002000000000000000800000000000000000000000000000019000000050000000200000000000000F807000000000000F80700000000000028000000000000000200000000000000040000000000000004000000000000001F000000030000000200000000000000200800000000000020080000000000004600000000000000000000000000000001000000000000000000000000000000270000000100000002000000000000008008000000000000800800000000000040000000000000000000000000000000400000000000000000000000000000002F000000010000000600000000000000001900000000000000090000000000006C00000000000000000000000000000000010000000000000000000000000000350000000600000003000000000000007029000000000000700900000000000070000000000000000500000000000000080000000000000010000000000000003E000000080000000300000000000000E029000000000000E00900000000000020060000000000000000000000000000010000000000000000000000000000004D000000080000000300000000000000E039000000000000E0090000000000000100000000000000000000000000000001000000000000000000000000000000520000000100000030000000000000000000000000000000E009000000000000F0000000000000000000000000000
000010000000000000001000000000000005B0000000200000000000000000000000000000000000000D00A00000000000078000000000000000E0000000200000008000000000000001800000000000000630000000300000000000000000000000000000000000000480B00000000000075000000000000000000000000000000010000000000000000000000000000006D0000000300000000000000000000000000000000000000BD0B0000000000004F00000000000000000000000000000001000000000000000000000000000000 + - Name: .hipFatBinSegment + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x202FD0 + AddressAlign: 0x8 + Content: '465049480100000000102000000000000000000000000000' +... diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s index 81d0d868ab918..2c212b92381a6 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s @@ -27,7 +27,7 @@ ; CHECK-NEXT: .amdhsa_fp16_overflow 0 ; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1 ; CHECK-NEXT: .amdhsa_memory_ordered 1 -; CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_forward_progress 1 ; CHECK-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 @@ -83,7 +83,7 @@ ; CHECK-NEXT: .amdhsa_fp16_overflow 0 ; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1 ; CHECK-NEXT: .amdhsa_memory_ordered 1 -; CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_forward_progress 1 ; CHECK-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 @@ -139,7 +139,7 @@ ; CHECK-NEXT: .amdhsa_fp16_overflow 0 ; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1 ; CHECK-NEXT: .amdhsa_memory_ordered 1 -; CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_forward_progress 1 ; CHECK-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 ; CHECK-NEXT: 
.amdhsa_system_sgpr_workgroup_id_x 1 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 @@ -195,7 +195,7 @@ ; CHECK-NEXT: .amdhsa_fp16_overflow 0 ; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1 ; CHECK-NEXT: .amdhsa_memory_ordered 1 -; CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_forward_progress 1 ; CHECK-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s index 750809128189f..4fea3b422472b 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s @@ -30,7 +30,7 @@ ; CHECK-NEXT: .amdhsa_fp16_overflow 0 ; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1 ; CHECK-NEXT: .amdhsa_memory_ordered 1 -; CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_forward_progress 1 ; CHECK-NEXT: .amdhsa_enable_private_segment 0 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 @@ -87,7 +87,7 @@ ; CHECK-NEXT: .amdhsa_fp16_overflow 0 ; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1 ; CHECK-NEXT: .amdhsa_memory_ordered 1 -; CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_forward_progress 1 ; CHECK-NEXT: .amdhsa_enable_private_segment 0 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 @@ -144,7 +144,7 @@ ; CHECK-NEXT: .amdhsa_fp16_overflow 0 ; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1 ; CHECK-NEXT: .amdhsa_memory_ordered 1 -; CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_forward_progress 1 ; CHECK-NEXT: .amdhsa_enable_private_segment 0 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 @@ -201,7 +201,7 @@ ; CHECK-NEXT: .amdhsa_fp16_overflow 0 ; CHECK-NEXT: .amdhsa_workgroup_processor_mode 
1 ; CHECK-NEXT: .amdhsa_memory_ordered 1 -; CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_forward_progress 1 ; CHECK-NEXT: .amdhsa_enable_private_segment 0 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s index c644e15efc8d7..942b927df9264 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s @@ -26,7 +26,7 @@ ; CHECK-NEXT: .amdhsa_fp16_overflow 0 ; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1 ; CHECK-NEXT: .amdhsa_memory_ordered 1 -; CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_forward_progress 1 ; CHECK-NEXT: .amdhsa_round_robin_scheduling 0 ; CHECK-NEXT: .amdhsa_enable_private_segment 0 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 @@ -80,7 +80,7 @@ ; CHECK-NEXT: .amdhsa_fp16_overflow 0 ; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1 ; CHECK-NEXT: .amdhsa_memory_ordered 1 -; CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_forward_progress 1 ; CHECK-NEXT: .amdhsa_round_robin_scheduling 0 ; CHECK-NEXT: .amdhsa_enable_private_segment 0 ; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 diff --git a/llvm/test/tools/llvm-objdump/Offloading/fatbin.test b/llvm/test/tools/llvm-objdump/Offloading/fatbin.test index 6530ed26c807d..40cb26896cd86 100644 --- a/llvm/test/tools/llvm-objdump/Offloading/fatbin.test +++ b/llvm/test/tools/llvm-objdump/Offloading/fatbin.test @@ -1,9 +1,9 @@ -## Test that --offload-fatbin works correctly +## Test that --offloading with a fatbin works correctly -# REQUIRES: target={{.*-linux.*}} +# REQUIRES: target={{x86_64-.*-linux.*}} # RUN: yaml2obj %s -o %t.elf -# RUN: llvm-objdump --offload-fatbin %t.elf -# RUN: llvm-objdump -d %t.elf:0.hipv4-amdgcn-amd-amdhsa--gfx908 | FileCheck %s +# RUN: llvm-objdump --offloading %t.elf +# RUN: llvm-objdump -d 
%t.elf.0.hipv4-amdgcn-amd-amdhsa--gfx908 | FileCheck %s # CHECK: s_load_dword s7, s[4:5], 0x24 @@ -40,289 +40,11 @@ ProgramHeaders: VAddr: 0x200040 Align: 0x8 Offset: 0x40 - - Type: PT_INTERP - Flags: [ PF_R ] - FirstSec: .interp - LastSec: .interp - VAddr: 0x2002A8 - Offset: 0x2A8 - - Type: PT_LOAD - Flags: [ PF_R ] - FirstSec: .interp - LastSec: .eh_frame - VAddr: 0x200000 - Align: 0x1000 - Offset: 0x0 - - Type: PT_LOAD - Flags: [ PF_X, PF_R ] - FirstSec: .text - LastSec: .plt - VAddr: 0x2041B0 - Align: 0x1000 - Offset: 0x31B0 - - Type: PT_LOAD - Flags: [ PF_W, PF_R ] - FirstSec: .init_array - LastSec: .relro_padding - VAddr: 0x205AE0 - Align: 0x1000 - Offset: 0x3AE0 - - Type: PT_LOAD - Flags: [ PF_W, PF_R ] - FirstSec: .data - LastSec: .bss - VAddr: 0x206CF8 - Align: 0x1000 - Offset: 0x3CF8 - - Type: PT_DYNAMIC - Flags: [ PF_W, PF_R ] - FirstSec: .dynamic - LastSec: .dynamic - VAddr: 0x205B00 - Align: 0x8 - Offset: 0x3B00 - - Type: PT_GNU_RELRO - Flags: [ PF_R ] - FirstSec: .init_array - LastSec: .relro_padding - VAddr: 0x205AE0 - Offset: 0x3AE0 - - Type: PT_GNU_EH_FRAME - Flags: [ PF_R ] - FirstSec: .eh_frame_hdr - LastSec: .eh_frame_hdr - VAddr: 0x202FE8 - Align: 0x4 - Offset: 0x2FE8 - Type: PT_GNU_STACK Flags: [ PF_W, PF_R ] Align: 0x0 Offset: 0x0 - - Type: PT_NOTE - Flags: [ PF_R ] - FirstSec: .note.ABI-tag - LastSec: .note.ABI-tag - VAddr: 0x2002C4 - Align: 0x4 - Offset: 0x2C4 Sections: - - Name: .interp - Type: SHT_PROGBITS - Flags: [ SHF_ALLOC ] - Address: 0x2002A8 - AddressAlign: 0x1 - Content: 2F6C696236342F6C642D6C696E75782D7838362D36342E736F2E3200 - - Name: .note.ABI-tag - Type: SHT_NOTE - Flags: [ SHF_ALLOC ] - Address: 0x2002C4 - AddressAlign: 0x4 - Notes: - - Name: GNU - Desc: '00000000030000000200000000000000' - Type: NT_VERSION - - Name: .dynsym - Type: SHT_DYNSYM - Flags: [ SHF_ALLOC ] - Address: 0x2002E8 - Link: .dynstr - AddressAlign: 0x8 - - Name: .gnu.version - Type: SHT_GNU_versym - Flags: [ SHF_ALLOC ] - Address: 0x2005D0 - Link: .dynsym - 
AddressAlign: 0x2 - Entries: [ 0, 2, 1, 3, 4, 3, 5, 5, 5, 7, 6, 8, 6, 6, 9, 4, - 5, 4, 5, 5, 5, 7, 4, 6, 6, 4, 5, 5, 5, 6, 6 ] - - Name: .gnu.version_r - Type: SHT_GNU_verneed - Flags: [ SHF_ALLOC ] - Address: 0x200610 - Link: .dynstr - AddressAlign: 0x4 - Dependencies: - - Version: 1 - File: libamdhip64.so.6 - Entries: - - Name: hip_4.2 - Hash: 252061554 - Flags: 0 - Other: 5 - - Name: hip_6.0 - Hash: 252062064 - Flags: 0 - Other: 9 - - Version: 1 - File: 'libstdc++.so.6' - Entries: - - Name: GLIBCXX_3.4 - Hash: 143796596 - Flags: 0 - Other: 6 - - Name: GLIBCXX_3.4.9 - Hash: 36274057 - Flags: 0 - Other: 7 - - Name: GLIBCXX_3.4.11 - Hash: 43513953 - Flags: 0 - Other: 8 - - Version: 1 - File: libgcc_s.so.1 - Entries: - - Name: GCC_3.0 - Hash: 192489040 - Flags: 0 - Other: 3 - - Version: 1 - File: libc.so.6 - Entries: - - Name: GLIBC_2.2.5 - Hash: 157882997 - Flags: 0 - Other: 4 - - Name: GLIBC_2.34 - Hash: 110530996 - Flags: 0 - Other: 2 - - Name: .gnu.hash - Type: SHT_GNU_HASH - Flags: [ SHF_ALLOC ] - Address: 0x2006D0 - Link: .dynsym - AddressAlign: 0x8 - Header: - SymNdx: 0x1D - Shift2: 0x1A - BloomFilter: [ 0x10000190000 ] - HashBuckets: [ 0x1D ] - HashValues: [ 0x430C9814, 0x4CD54529 ] - - Name: .dynstr - Type: SHT_STRTAB - Flags: [ SHF_ALLOC ] - Address: 0x2006F4 - AddressAlign: 0x1 - - Name: .rela.dyn - Type: SHT_RELA - Flags: [ SHF_ALLOC ] - Address: 0x200A18 - Link: .dynsym - AddressAlign: 0x8 - Relocations: - - Offset: 0x205CD0 - Symbol: __libc_start_main - Type: R_X86_64_GLOB_DAT - - Offset: 0x205CD8 - Symbol: __gmon_start__ - Type: R_X86_64_GLOB_DAT - - Offset: 0x205CE0 - Symbol: __register_frame_info - Type: R_X86_64_GLOB_DAT - - Offset: 0x205CE8 - Symbol: __cxa_finalize - Type: R_X86_64_GLOB_DAT - - Offset: 0x205CF0 - Symbol: __deregister_frame_info - Type: R_X86_64_GLOB_DAT - - Offset: 0x206E80 - Symbol: _ZSt4cout - Type: R_X86_64_COPY - - Name: .rela.plt - Type: SHT_RELA - Flags: [ SHF_ALLOC, SHF_INFO_LINK ] - Address: 0x200AA8 - Link: .dynsym - 
AddressAlign: 0x8 - Info: .got.plt - Relocations: - - Offset: 0x206D20 - Symbol: __register_frame_info - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D28 - Symbol: __cxa_finalize - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D30 - Symbol: __deregister_frame_info - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D38 - Symbol: __hipPopCallConfiguration - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D40 - Symbol: hipLaunchKernel - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D48 - Symbol: hipGetDeviceCount - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D50 - Symbol: _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D58 - Symbol: _ZNSolsEi - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D60 - Symbol: _ZNKSt5ctypeIcE13_M_widen_initEv - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D68 - Symbol: _ZNSo3putEc - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D70 - Symbol: _ZNSo5flushEv - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D78 - Symbol: hipGetDevicePropertiesR0600 - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D80 - Symbol: strlen - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D88 - Symbol: hipSetDevice - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D90 - Symbol: malloc - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206D98 - Symbol: hipMalloc - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206DA0 - Symbol: hipMemcpy - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206DA8 - Symbol: __hipPushCallConfiguration - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206DB0 - Symbol: _ZNSo9_M_insertImEERSoT_ - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206DB8 - Symbol: free - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206DC0 - Symbol: _ZSt16__throw_bad_castv - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206DC8 - Symbol: _ZNSt8ios_base4InitC1Ev - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206DD0 - Symbol: _ZNSt8ios_base4InitD1Ev - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206DD8 - Symbol: __cxa_atexit - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206DE0 - Symbol: __hipRegisterFatBinary - Type: 
R_X86_64_JUMP_SLOT - - Offset: 0x206DE8 - Symbol: __hipRegisterFunction - Type: R_X86_64_JUMP_SLOT - - Offset: 0x206DF0 - Symbol: __hipUnregisterFatBinary - Type: R_X86_64_JUMP_SLOT - - Name: .rodata - Type: SHT_PROGBITS - Flags: [ SHF_ALLOC, SHF_MERGE, SHF_STRINGS ] - Address: 0x200D30 - AddressAlign: 0x10 - Content: 010002000000000000000000000000000800000008000000080000000800000000000000010000000200000003000000040000000400000004000000040000000C0000000C0000000C0000000C000000100000001000000010000000100000008042200000000000446576696365200020002048495020646576696365732E004D69736D61746368206F636375726564206174200020213D2000466F756E642000204E616D653A20005F5A3973696D706C65416464506A504B6A6D003A2000 - Name: .hip_fatbin Type: SHT_PROGBITS Flags: [ SHF_ALLOC ] @@ -335,511 +57,4 @@ Sections: Address: 0x202FD0 AddressAlign: 0x8 Content: '465049480100000000102000000000000000000000000000' - - Name: .eh_frame_hdr - Type: SHT_PROGBITS - Flags: [ SHF_ALLOC ] - Address: 0x202FE8 - AddressAlign: 0x4 - Content: 011B033B5C0000000A000000C811000078000000F81100009000000008120000A400000048120000C400000098120000E400000018130000040100004818000070010000A818000088010000D818000058010000F8180000A4010000 - - Name: .eh_frame - Type: SHT_PROGBITS - Flags: [ SHF_ALLOC ] - Address: 0x203048 - AddressAlign: 0x8 - Content: 
1400000000000000017A5200017810011B0C070890010000140000001C0000004811000026000000004407100000000010000000340000006011000005000000000000001C000000480000005C1100003900000000410E108602430D06740C07080000001C000000680000007C1100004C00000000410E108602430D0602470C070800001C00000088000000AC1100007900000000440E7002670E78440E8001490E080050000000A80000000C1200002C05000000420E10420E18420E20420E28410E30470ED00C83068C058D048E038F02037F030ED80C440EE00C490ED00C0381010E30410E28420E20420E18420E10420E08410ED00C14000000FC000000781700002000000000410E105A0E08001400000014010000D01600005B00000000440E3002520E08180000002C0100001817000022000000004D0E10540E08000000000014000000480100004C17000012000000000000000000000000000000 - - Name: .text - Type: SHT_PROGBITS - Flags: [ SHF_ALLOC, SHF_EXECINSTR ] - Address: 0x2041B0 - AddressAlign: 0x10 - Content: F30F1EFA31ED4989D15E4889E24883E4F050544531C031C9488D3D31010000FF15FB1A0000F4662E0F1F840000000000F30F1EFAC3CCCCCCCCCCCCCCCCCCCCCC554889E5F605052C0000017402EB28C605FA2B000001488B05D31A00004885C07415EB00488D3D2DEEFFFF488D35E62B0000E8090700005DC30F1F8000000000554889E5F6050D2C0000017402EB3BC605022C000001488B059B1A00004885C0740EEB00488B3DA52A0000E8E0060000488B05891A00004885C0740EEB00488D3DD3EDFFFFE8D60600005DC3CCCCCCCC4883EC6848897C244848897424404889542438488D4424484889442450488D4424404889442458488D4424384889442460488D7C2428488D742418488D542410488D4C2408E896060000488B7424288B542430488B4C2418448B4424204C8D4C2450BF900D2000FF742408FF742418E87C0600004883C478C30F1F80000000004157415641554154534881EC20060000C744240400000000488D7C2404E85E060000BF806E2000BECA0D2000BA06000000E85A0600008B742404BF806E2000E85C0600004889C3BEA20D2000BA0D0000004889C7E837060000488B03488B40E84C8BB403F00000004D85F60F84B604000041807E38007407410FB64643EB164C89F7E829060000498B064C89F7BE0A000000FF50300FBEF04889DFE8200600004889C7E828060000837C2404000F8E2401000031DB4C8D742460EB3B0F1F40004C89FFE8E8050000498B074C89FFBE0A000000FF50300FBEF0BF806E2000E8DD0500004889C7E8E5050000FFC33B5C24040F8DE0000000BF
806E2000BE980D2000BA07000000E885050000BF806E200089DEE889050000488B08488B49E84C8BBC08F00000004D85FF0F84F803000041807F3800740A410FB64F43EB210F1F004C89FF4989C4E865050000498B074C89FFBE0A000000FF503089C14C89E00FBEF14889C7E8570500004889C7E85F0500004C89F789DEE865050000BF806E2000BED10D2000BA07000000E8010500004C89F7E859050000BF806E20004C89F64889C2E8E9040000488B05D2290000488B40E84C8BB8706F20004D85FF0F846503000041807F38000F84F3FEFFFF410FB64743E9FFFEFFFF31FFE822050000BFA00F0000E8280500004889C3BFA00F0000E81B0500004989C6660F6F0550C8FFFFB80C000000660F6F0D53C8FFFF660F6F152BC8FFFF660F6F1D53C8FFFF660F6F255BC8FFFF66662E0F1F840000000000660F6FE8660FFEE9F30F7F4483D0F30F7F6C83E0660F6FF0660FFEF0660FFEEDF3410F7F7486D0F3410F7F6C86E0483DEC030000743A660F6FE8660FFEEA660F6FF0660FFEF3F30F7F6C83F0F30F7F3483660FFEED660FFEF6F3410F7F6C86F0F3410F7F3486660FFEC44883C010EB90488D7C2408BEA00F0000E871040000488D7C2410BEA00F0000E862040000488B7C2408BAA00F00004889DEB901000000E85B040000488B7C2410BAA00F00004C89F6B901000000E84404000048BA0001000001000000488DBA00010000BE01000000B9010000004531C04531C9E82E04000085C00F8582000000488B442408488B4C2410488944245848894C245048C7442448E8030000488D4424584889442460488D4424504889442468488D4424484889442470488D7C2438488D742428488D542420488D4C2418E8F2020000488B7424388B542440488B4C2428448B4424304C8D4C2460BF900D2000FF742418FF742428E8D80200004883C410488B742408BAA00F00004889DFB902000000E87D0300004531FF4531E40F1F8000000000428B34A3BF806E2000E882030000BEA00D2000BA010000004889C7E8B0020000463B3CA3751549FFC44983C7034981FCE803000075CAE9B9000000BF806E2000BEB00D2000BA14000000E881020000BF806E20004C89E6E8340300004989C5BEEC0D2000BA020000004889C7E85F020000428B34A34C89EFE8130300004989C4BEC50D2000BA040000004889C7E83E0200004C89E74C89FEE8F3020000488B08488B49E84C8BBC08F00000004D85FF0F84B200000041807F38007407410FB64F43EB1E4C89FF4989C4E822020000498B074C89FFBE0A000000FF503089C14C89E00FBEF14889C7E8140200004889C7E81C020000488B05C5260000488B40E84C8BB8706F20004D85FF745C41807F38007407410FB64743EB164C89FFE8CF0100
00498B074C89FFBE0A000000FF50300FBEF0BF806E2000E8C40100004889C7E8CC0100004889DFE8540200004C89F7E84C02000031C04881C4200600005B415C415D415E415FC3E8440200000F1F40004883EC28488B3D1D2600004885FF7514BFD02F2000E8660200004889C7488905042600000F57C00F114424100F110424BE900D2000BAD90D2000B9D90D200041B8FFFFFFFF4531C9E843020000BF904820004883C428E9550000000F1F440000488B3DC12500004885FF741550E82E02000048C705AB250000000000004883C408C3CCCCCCCCCCCCCCCCCCCCCCCCCCCC50BF506E2000E8B5010000BF904A2000BE506E2000BA006D200058E9C0010000F30F1EFA488B151524000031F6E9AE010000CCCC - - Name: .init - Type: SHT_PROGBITS - Flags: [ SHF_ALLOC, SHF_EXECINSTR ] - Address: 0x2048F4 - AddressAlign: 0x4 - Content: F30F1EFA4883EC08488B05D51300004885C07402FFD04883C408C3 - - Name: .fini - Type: SHT_PROGBITS - Flags: [ SHF_ALLOC, SHF_EXECINSTR ] - Address: 0x204910 - AddressAlign: 0x4 - Content: F30F1EFA4883EC084883C408C3 - - Name: .plt - Type: SHT_PROGBITS - Flags: [ SHF_ALLOC, SHF_EXECINSTR ] - Address: 0x204920 - AddressAlign: 0x10 - Content: FF35EA230000FF25EC2300000F1F4000FF25EA2300006800000000E9E0FFFFFFFF25E22300006801000000E9D0FFFFFFFF25DA2300006802000000E9C0FFFFFFFF25D22300006803000000E9B0FFFFFFFF25CA2300006804000000E9A0FFFFFFFF25C22300006805000000E990FFFFFFFF25BA2300006806000000E980FFFFFFFF25B22300006807000000E970FFFFFFFF25AA2300006808000000E960FFFFFFFF25A22300006809000000E950FFFFFFFF259A230000680A000000E940FFFFFFFF2592230000680B000000E930FFFFFFFF258A230000680C000000E920FFFFFFFF2582230000680D000000E910FFFFFFFF257A230000680E000000E900FFFFFFFF2572230000680F000000E9F0FEFFFFFF256A2300006810000000E9E0FEFFFFFF25622300006811000000E9D0FEFFFFFF255A2300006812000000E9C0FEFFFFFF25522300006813000000E9B0FEFFFFFF254A2300006814000000E9A0FEFFFFFF25422300006815000000E990FEFFFFFF253A2300006816000000E980FEFFFFFF25322300006817000000E970FEFFFFFF252A2300006818000000E960FEFFFFFF25222300006819000000E950FEFFFFFF251A230000681A000000E940FEFFFF - - Name: .init_array - Type: SHT_INIT_ARRAY - Flags: [ SHF_WRITE, SHF_ALLOC ] - Address: 
0x205AE0 - AddressAlign: 0x8 - Content: F041200000000000C0482000000000003048200000000000 - - Name: .fini_array - Type: SHT_FINI_ARRAY - Flags: [ SHF_WRITE, SHF_ALLOC ] - Address: 0x205AF8 - AddressAlign: 0x8 - Content: '3042200000000000' - - Name: .dynamic - Type: SHT_DYNAMIC - Flags: [ SHF_WRITE, SHF_ALLOC ] - Address: 0x205B00 - Link: .dynstr - AddressAlign: 0x8 - Entries: - - Tag: DT_RUNPATH - Value: 0x2E7 - - Tag: DT_NEEDED - Value: 0x257 - - Tag: DT_NEEDED - Value: 0x278 - - Tag: DT_NEEDED - Value: 0x316 - - Tag: DT_NEEDED - Value: 0x2B0 - - Tag: DT_NEEDED - Value: 0x2C6 - - Tag: DT_DEBUG - Value: 0x0 - - Tag: DT_RELA - Value: 0x200A18 - - Tag: DT_RELASZ - Value: 0x90 - - Tag: DT_RELAENT - Value: 0x18 - - Tag: DT_JMPREL - Value: 0x200AA8 - - Tag: DT_PLTRELSZ - Value: 0x288 - - Tag: DT_PLTGOT - Value: 0x206D08 - - Tag: DT_PLTREL - Value: 0x7 - - Tag: DT_SYMTAB - Value: 0x2002E8 - - Tag: DT_SYMENT - Value: 0x18 - - Tag: DT_STRTAB - Value: 0x2006F4 - - Tag: DT_STRSZ - Value: 0x320 - - Tag: DT_GNU_HASH - Value: 0x2006D0 - - Tag: DT_INIT_ARRAY - Value: 0x205AE0 - - Tag: DT_INIT_ARRAYSZ - Value: 0x18 - - Tag: DT_FINI_ARRAY - Value: 0x205AF8 - - Tag: DT_FINI_ARRAYSZ - Value: 0x8 - - Tag: DT_INIT - Value: 0x2048F4 - - Tag: DT_FINI - Value: 0x204910 - - Tag: DT_VERSYM - Value: 0x2005D0 - - Tag: DT_VERNEED - Value: 0x200610 - - Tag: DT_VERNEEDNUM - Value: 0x4 - - Tag: DT_NULL - Value: 0x0 - - Name: .got - Type: SHT_PROGBITS - Flags: [ SHF_WRITE, SHF_ALLOC ] - Address: 0x205CD0 - AddressAlign: 0x8 - Content: '00000000000000000000000000000000000000000000000000000000000000000000000000000000' - - Name: .relro_padding - Type: SHT_NOBITS - Flags: [ SHF_WRITE, SHF_ALLOC ] - Address: 0x205CF8 - AddressAlign: 0x1 - Size: 0x308 - - Name: .data - Type: SHT_PROGBITS - Flags: [ SHF_WRITE, SHF_ALLOC ] - Address: 0x206CF8 - AddressAlign: 0x8 - Content: 0000000000000000006D200000000000 - - Name: .got.plt - Type: SHT_PROGBITS - Flags: [ SHF_WRITE, SHF_ALLOC ] - Address: 0x206D08 - 
AddressAlign: 0x8 - Content: 005B200000000000000000000000000000000000000000003649200000000000464920000000000056492000000000006649200000000000764920000000000086492000000000009649200000000000A649200000000000B649200000000000C649200000000000D649200000000000E649200000000000F649200000000000064A200000000000164A200000000000264A200000000000364A200000000000464A200000000000564A200000000000664A200000000000764A200000000000864A200000000000964A200000000000A64A200000000000B64A200000000000C64A200000000000D64A200000000000 - - Name: .bss - Type: SHT_NOBITS - Flags: [ SHF_WRITE, SHF_ALLOC ] - Address: 0x206E00 - AddressAlign: 0x40 - Offset: 0x3DF8 - Size: 0x190 - - Name: .comment - Type: SHT_PROGBITS - Flags: [ SHF_MERGE, SHF_STRINGS ] - AddressAlign: 0x1 - EntSize: 0x1 - Content: 4C696E6B65723A20414D44204C4C442031392E302E3000414D4420636C616E672076657273696F6E2031392E302E306769742028202032343231322063393630313665636534313337356462646438663037356266333762643666633333323230376233290000414D4420636C616E672076657273696F6E2031392E302E3067697420282020323431393320373139633463633762336363396237353535333365363639656439316435373935346437373336352900 -Symbols: - - Name: __abi_tag - Type: STT_OBJECT - Section: .note.ABI-tag - Value: 0x2002C4 - Size: 0x20 - - Name: _dl_relocate_static_pie - Type: STT_FUNC - Section: .text - Value: 0x2041E0 - Size: 0x5 - Other: [ STV_HIDDEN ] - - Name: crtbegin.c - Type: STT_FILE - Index: SHN_ABS - - Name: __do_init - Type: STT_FUNC - Section: .text - Value: 0x2041F0 - Size: 0x39 - - Name: __do_init.__initialized - Type: STT_OBJECT - Section: .bss - Value: 0x206E00 - Size: 0x1 - - Name: __EH_FRAME_LIST__ - Type: STT_OBJECT - Section: .eh_frame - Value: 0x203048 - - Name: __do_init.__object - Type: STT_OBJECT - Section: .bss - Value: 0x206E08 - Size: 0x40 - - Name: __do_fini - Type: STT_FUNC - Section: .text - Value: 0x204230 - Size: 0x4C - - Name: __do_fini.__finalized - Type: STT_OBJECT - Section: .bss - Value: 0x206E48 - Size: 0x1 - - Name: __init - Type: 
STT_OBJECT - Section: .init_array - Value: 0x205AE0 - Size: 0x8 - - Name: __fini - Type: STT_OBJECT - Section: .fini_array - Value: 0x205AF8 - Size: 0x8 - - Name: __dso_handle - Type: STT_OBJECT - Section: .data - Value: 0x206D00 - Size: 0x8 - Other: [ STV_HIDDEN ] - - Name: simpleAdd.cpp - Type: STT_FILE - Index: SHN_ABS - - Name: _GLOBAL__sub_I_simpleAdd.cpp - Type: STT_FUNC - Section: .text - Value: 0x2048C0 - Size: 0x20 - - Name: _ZStL8__ioinit - Type: STT_OBJECT - Section: .bss - Value: 0x206E50 - Size: 0x1 - - Name: __hip_module_ctor - Type: STT_FUNC - Section: .text - Value: 0x204830 - Size: 0x5B - - Name: __hip_gpubin_handle - Type: STT_OBJECT - Section: .bss - Value: 0x206E58 - Size: 0x8 - - Name: __hip_fatbin_wrapper - Type: STT_OBJECT - Section: .hipFatBinSegment - Value: 0x202FD0 - Size: 0x18 - - Name: __hip_module_dtor - Type: STT_FUNC - Section: .text - Value: 0x204890 - Size: 0x22 - - Name: crtend.c - Type: STT_FILE - Index: SHN_ABS - - Name: __EH_FRAME_LIST_END__ - Type: STT_OBJECT - Section: .eh_frame - Value: 0x203048 - Size: 0x4 - Other: [ STV_HIDDEN ] - - Name: _GLOBAL_OFFSET_TABLE_ - Section: .got.plt - Value: 0x206D08 - Other: [ STV_HIDDEN ] - - Name: _DYNAMIC - Section: .dynamic - Value: 0x205B00 - Other: [ STV_HIDDEN ] - - Name: _init - Type: STT_FUNC - Section: .init - Value: 0x2048F4 - Other: [ STV_HIDDEN ] - - Name: _fini - Type: STT_FUNC - Section: .fini - Value: 0x204910 - Other: [ STV_HIDDEN ] - - Name: atexit - Type: STT_FUNC - Section: .text - Value: 0x2048E0 - Size: 0x12 - Other: [ STV_HIDDEN ] - - Name: _start - Type: STT_FUNC - Section: .text - Binding: STB_GLOBAL - Value: 0x2041B0 - Size: 0x26 - - Name: main - Type: STT_FUNC - Section: .text - Binding: STB_GLOBAL - Value: 0x204300 - Size: 0x52C - - Name: data_start - Section: .data - Binding: STB_WEAK - Value: 0x206CF8 - - Name: _IO_stdin_used - Type: STT_OBJECT - Section: .rodata - Binding: STB_GLOBAL - Value: 0x200D30 - Size: 0x4 - - Name: __libc_start_main - Type: STT_FUNC - 
Binding: STB_GLOBAL - - Name: __data_start - Section: .data - Binding: STB_GLOBAL - Value: 0x206CF8 - - Name: __gmon_start__ - Binding: STB_WEAK - - Name: __register_frame_info - Type: STT_FUNC - Binding: STB_WEAK - - Name: __cxa_finalize - Type: STT_FUNC - Binding: STB_WEAK - - Name: __deregister_frame_info - Type: STT_FUNC - Binding: STB_WEAK - - Name: _Z24__device_stub__simpleAddPjPKjm - Type: STT_FUNC - Section: .text - Binding: STB_GLOBAL - Value: 0x204280 - Size: 0x79 - - Name: __hipPopCallConfiguration - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _Z9simpleAddPjPKjm - Type: STT_OBJECT - Section: .rodata - Binding: STB_GLOBAL - Value: 0x200D90 - Size: 0x8 - - Name: hipLaunchKernel - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: hipGetDeviceCount - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZSt4cout - Type: STT_OBJECT - Section: .bss - Binding: STB_GLOBAL - Value: 0x206E80 - Size: 0x110 - - Name: _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNSolsEi - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNKSt5ctypeIcE13_M_widen_initEv - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNSo3putEc - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNSo5flushEv - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: hipGetDevicePropertiesR0600 - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: strlen - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: hipSetDevice - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: malloc - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: hipMalloc - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: hipMemcpy - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: __hipPushCallConfiguration - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNSo9_M_insertImEERSoT_ - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: free - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZSt16__throw_bad_castv - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNSt8ios_base4InitC1Ev - Type: 
STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNSt8ios_base4InitD1Ev - Type: STT_FUNC - Binding: STB_GLOBAL - Value: 0x204A90 - - Name: __cxa_atexit - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: __hipRegisterFatBinary - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: __hipRegisterFunction - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: __hipUnregisterFatBinary - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: __hip_cuid_b7062d8c32a4a933 - Type: STT_OBJECT - Section: .bss - Binding: STB_GLOBAL - Value: 0x206E60 - Size: 0x1 -DynamicSymbols: - - Name: __libc_start_main - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: __gmon_start__ - Binding: STB_WEAK - - Name: __register_frame_info - Type: STT_FUNC - Binding: STB_WEAK - - Name: __cxa_finalize - Type: STT_FUNC - Binding: STB_WEAK - - Name: __deregister_frame_info - Type: STT_FUNC - Binding: STB_WEAK - - Name: __hipPopCallConfiguration - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: hipLaunchKernel - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: hipGetDeviceCount - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNSolsEi - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNKSt5ctypeIcE13_M_widen_initEv - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNSo3putEc - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNSo5flushEv - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: hipGetDevicePropertiesR0600 - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: strlen - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: hipSetDevice - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: malloc - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: hipMalloc - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: hipMemcpy - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: __hipPushCallConfiguration - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNSo9_M_insertImEERSoT_ - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: free - Type: 
STT_FUNC - Binding: STB_GLOBAL - - Name: _ZSt16__throw_bad_castv - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZNSt8ios_base4InitC1Ev - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: __cxa_atexit - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: __hipRegisterFatBinary - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: __hipRegisterFunction - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: __hipUnregisterFatBinary - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: _ZSt4cout - Type: STT_OBJECT - Section: .bss - Binding: STB_GLOBAL - Value: 0x206E80 - Size: 0x110 - - Name: _ZNSt8ios_base4InitD1Ev - Type: STT_FUNC - Binding: STB_GLOBAL - Value: 0x204A90 ... diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test new file mode 100644 index 0000000000000..9656172f2941c --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test @@ -0,0 +1,42 @@ +## Test that --offloading with a fatbin works correctly +# REQUIRES: target={{x86_64-.*-linux.*}} +# REQUIRES: amdgpu-registered-target + +# RUN: yaml2obj %s -o %t.elf +# RUN: llvm-readobj --offloading %t.elf > %t.out +# RUN: FileCheck %s --input-file=%t.out -DFILE_NAME=%t.elf + +# CHECK: host-x86_64-unknown-linux-- file://[[FILE_NAME]]#offset=8192&size=0 +# CHECK-NEXT: hipv4-amdgcn-amd-amdhsa--gfx908 file://[[FILE_NAME]]#offset=8192&size=4048 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 + Entry: 0x2041B0 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x200040 + Align: 0x8 + Offset: 0x40 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Align: 0x0 + Offset: 0x0 +Sections: + - Name: .hip_fatbin + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x201000 + AddressAlign: 0x1000 + Content: 
5F5F434C414E475F4F46464C4F41445F42554E444C455F5F0200000000000000001000000000000000000000000000001B00000000000000686F73742D7838365F36342D756E6B6E6F776E2D6C696E75782D2D0010000000000000D00F0000000000001F0000000000000068697076342D616D6467636E2D616D642D616D646873612D2D67667839303800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000007F454C460201014003000000000000000300E0000100000000000000000000004000000000000000100C0000000000003005000040003800090040000F000D000600000004000000400000000000000040000000000000004000000000000000F801000000000000F80100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000C008000000000000C008000000000000001000000000000001000000050000000009000000000000001900000000000000190000000000006C000000000000006C00000000000000001000000000000001000000060000007009000000000000702900000000000070290000000000007000000000000000900600000000000000100000000000000100000006000000E009000000000000E039000000000000E039000000000000000000000000000001000000000000000010000000000000020000000600000070090000000000007029000000000000702900000000000070000000000000007000000000000000080000000000000052E574640400000070090000000000007029000000000000702900000000000070000000000000009006000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000004000000380200000000000038020000000000003802000000000000340500000000000034050000000000000400000000000000070000001D05000020000000414D44475055000083AE616D646873612E6B65726E656C7391DE0012AB2E616770725F636F756E7400A52E61726773DC001085AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA415F642E636F65726365A72E6F666673657400A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657285AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA425F642E636F65726365A72E6F666673657408A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657284A52E6E616D65A14EA72E6F666673657410A52E73697A6508AB2E76616C75655F6B696E64A862795F76616C756583A72E6F666673657418A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7883A72E6F66667365741CA52E7369
7A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7983A72E6F666673657420A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7A83A72E6F666673657424A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7883A72E6F666673657426A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7983A72E6F666673657428A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7A83A72E6F66667365742AA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7883A72E6F66667365742CA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7983A72E6F66667365742EA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7A83A72E6F666673657440A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7883A72E6F666673657448A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7983A72E6F666673657450A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7A83A72E6F666673657458A52E73697A6502AB2E76616C75655F6B696E64B068696464656E5F677269645F64696D73B92E67726F75705F7365676D656E745F66697865645F73697A6500B62E6B65726E6172675F7365676D656E745F616C69676E08B52E6B65726E6172675F7365676D656E745F73697A65CD0118A92E6C616E6775616765A84F70656E434C2043B12E6C616E67756167655F76657273696F6E920200B82E6D61785F666C61745F776F726B67726F75705F73697A65CD0400A52E6E616D65B25F5A3973696D706C65416464506A504B6A6DBB2E707269766174655F7365676D656E745F66697865645F73697A6500AB2E736770725F636F756E740CB12E736770725F7370696C6C5F636F756E7400A72E73796D626F6CB55F5A3973696D706C65416464506A504B6A6D2E6B64B82E756E69666F726D5F776F726B5F67726F75705F73697A6501B32E757365735F64796E616D69635F737461636BC2AB2E766770725F636F756E7404B12E766770725F7370696C6C5F636F756E7400AF2E7761766566726F6E745F73697A6540AD616D646873612E746172676574B9616D6467636E2D616D642D616D646873612D2D676678393038AE616D646873612E76657273696F6E92010200000000000000
000000000000000000000000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E03900000000000001000000000000000100000001000000010000001A000000000008400000D20001000000360A4A7A5238A4D3F113F4DD04000000040000000200000001000000000000000300000000000000000000000000000000000000005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F623730363264386333326134613933330000000000000000000000000000000000000000000000000000000000000000000000180100000000000080100000000000000000000000000000000000000000000000000000000000004000AF008C000000090000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000C20102C02400000002000AC0000000008002027E7FC08CBF07FF0486FFFF0000060406920600006800008FD2820002000302067E0200043203030638008050DC02007F020102067E0000003203030238008050DC00007F03700F8CBF03050468008070DC00027F00000081BF00000000060000000000000070070000000000000B000000000000001800000000000000050000000000000020080000000000000A000000000000004600000000000000F5FEFF6F00000000D0070000000000000400000000000000F807000000000000000000000000000000000000000000004C696E6B65723A20414D44204C4C442031392E302E3000414D4420636C616E672076657273696F6E2031392E302E306769742028202032343231322063393630313665636534313337356462646438663037356266333762643666633333323230376233290000414D4420636C616E672076657273696F6E2031382E302E3067697420287373683A2F2F6765727269746769742F6C696768746E696E672F65632F6C6C766D2D70726F6A65637420616D642D6D61696E6C696E652D6F70656E20323431373620663935303039613166393032313232343865313036333964653837653635636163616338643961372900000000000000000000000000000000000000000000000000460000000002080070290000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E0390000000000000100000000000000002E6E6F7465002E64796E73796D002E
676E752E68617368002E68617368002E64796E737472002E726F64617461002E74657874002E64796E616D6963002E72656C726F5F70616464696E67002E627373002E636F6D6D656E74002E73796D746162002E7368737472746162002E73747274616200005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F62373036326438633332613461393333005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000070000000200000000000000380200000000000038020000000000003405000000000000000000000000000004000000000000000000000000000000070000000B00000002000000000000007007000000000000700700000000000060000000000000000500000001000000080000000000000018000000000000000F000000F6FFFF6F0200000000000000D007000000000000D007000000000000280000000000000002000000000000000800000000000000000000000000000019000000050000000200000000000000F807000000000000F80700000000000028000000000000000200000000000000040000000000000004000000000000001F000000030000000200000000000000200800000000000020080000000000004600000000000000000000000000000001000000000000000000000000000000270000000100000002000000000000008008000000000000800800000000000040000000000000000000000000000000400000000000000000000000000000002F000000010000000600000000000000001900000000000000090000000000006C00000000000000000000000000000000010000000000000000000000000000350000000600000003000000000000007029000000000000700900000000000070000000000000000500000000000000080000000000000010000000000000003E000000080000000300000000000000E029000000000000E00900000000000020060000000000000000000000000000010000000000000000000000000000004D000000080000000300000000000000E039000000000000E0090000000000000100000000000000000000000000000001000000000000000000000000000000520000000100000030000000000000000000000000000000E009000000000000F0000000000000000000000000000000010000000000000001000000000000005B0000000200000000000000000000000000000000000000D00A00000000000078000000000000000E00000002000000
08000000000000001800000000000000630000000300000000000000000000000000000000000000480B00000000000075000000000000000000000000000000010000000000000000000000000000006D0000000300000000000000000000000000000000000000BD0B0000000000004F00000000000000000000000000000001000000000000000000000000000000 + - Name: .hipFatBinSegment + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x202FD0 + AddressAlign: 0x8 + Content: '465049480100000000102000000000000000000000000000' +... diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp index ae965e6f486aa..b42bfbed3b873 100644 --- a/llvm/tools/gold/gold-plugin.cpp +++ b/llvm/tools/gold/gold-plugin.cpp @@ -1119,7 +1119,9 @@ static std::vector, bool>> runLTO() { auto AddBuffer = [&](size_t Task, const Twine &moduleName, std::unique_ptr MB) { - *AddStream(Task, moduleName)->OS << MB->getBuffer(); + auto Stream = *AddStream(Task, ModuleName); + Stream->OS << MB->getBuffer(); + check(Stream->commit(), "Failed to commit cache"); }; FileCache Cache; diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp index 448660a539a0b..19246f0394167 100644 --- a/llvm/tools/lli/lli.cpp +++ b/llvm/tools/lli/lli.cpp @@ -27,9 +27,7 @@ #include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupport.h" -#include "llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h" #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h" -#include "llvm/ExecutionEngine/Orc/EPCEHFrameRegistrar.h" #include "llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/IRPartitionLayer.h" @@ -1033,14 +1031,10 @@ int runOrcJIT(const char *ProgName) { Builder.getJITTargetMachineBuilder() ->setRelocationModel(Reloc::PIC_) .setCodeModel(CodeModel::Small); - Builder.setObjectLinkingLayerCreator([&P](orc::ExecutionSession &ES, - const Triple &TT) { - auto L = 
std::make_unique(ES); - if (P != LLJITPlatform::ExecutorNative) - L->addPlugin(std::make_unique( - ES, ExitOnErr(orc::EPCEHFrameRegistrar::Create(ES)))); - return L; - }); + Builder.setObjectLinkingLayerCreator( + [&](orc::ExecutionSession &ES, const Triple &TT) { + return std::make_unique(ES); + }); } auto J = ExitOnErr(Builder.create()); diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp index d4f022ef021a4..270510a013193 100644 --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -448,7 +448,9 @@ static int run(int argc, char **argv) { auto AddBuffer = [&](size_t Task, const Twine &ModuleName, std::unique_ptr MB) { - *AddStream(Task, ModuleName)->OS << MB->getBuffer(); + auto Stream = AddStream(Task, ModuleName); + *Stream->OS << MB->getBuffer(); + check(Stream->commit(), "Failed to commit cache"); }; FileCache Cache; diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp index 0d209590655ef..0b29ff85edd92 100644 --- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp +++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp @@ -16,6 +16,8 @@ #include "llvm/ObjCopy/ConfigManager.h" #include "llvm/ObjCopy/MachO/MachOConfig.h" #include "llvm/Object/Binary.h" +#include "llvm/Object/OffloadBinary.h" +#include "llvm/Object/OffloadBundle.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/CRC.h" @@ -284,6 +286,11 @@ static Expected parseVisibilityType(StringRef VisType) { return type; } +static void parseDumpOffloadBundle(StringRef URI) { + if (Error Err = object::extractOffloadBundleByURI(URI)) + outs() << "Failed to extract from URI."; +} + namespace { struct TargetInfo { FileFormat Format; @@ -727,6 +734,15 @@ objcopy::parseObjcopyOptions(ArrayRef ArgsArr, SmallVector Positional; + ConfigManager ConfigMgr; + CommonConfig &Config = ConfigMgr.Common; + COFFConfig &COFFConfig = ConfigMgr.COFF; + ELFConfig &ELFConfig = ConfigMgr.ELF; + 
MachOConfig &MachOConfig = ConfigMgr.MachO; + + if (InputArgs.hasArg(OBJCOPY_dump_offload_bundle)) + Config.NeedPositional = false; + for (auto *Arg : InputArgs.filtered(OBJCOPY_UNKNOWN)) return createStringError(errc::invalid_argument, "unknown argument '%s'", Arg->getAsString(InputArgs).c_str()); @@ -734,27 +750,28 @@ objcopy::parseObjcopyOptions(ArrayRef ArgsArr, for (auto *Arg : InputArgs.filtered(OBJCOPY_INPUT)) Positional.push_back(Arg->getValue()); - if (Positional.empty()) + if (Positional.empty() && Config.NeedPositional) return createStringError(errc::invalid_argument, "no input file specified"); - if (Positional.size() > 2) + if (Positional.size() > 2 && Config.NeedPositional) return createStringError(errc::invalid_argument, "too many positional arguments"); - ConfigManager ConfigMgr; - CommonConfig &Config = ConfigMgr.Common; - COFFConfig &COFFConfig = ConfigMgr.COFF; - ELFConfig &ELFConfig = ConfigMgr.ELF; - MachOConfig &MachOConfig = ConfigMgr.MachO; - Config.InputFilename = Positional[0]; - Config.OutputFilename = Positional[Positional.size() == 1 ? 0 : 1]; - if (InputArgs.hasArg(OBJCOPY_target) && - (InputArgs.hasArg(OBJCOPY_input_target) || - InputArgs.hasArg(OBJCOPY_output_target))) - return createStringError( - errc::invalid_argument, - "--target cannot be used with --input-target or --output-target"); + if (Arg *A = InputArgs.getLastArg(OBJCOPY_dump_offload_bundle)) { + for (StringRef URIStr : llvm::split(A->getValue(), ",")) + parseDumpOffloadBundle(URIStr); + } + if (Config.NeedPositional) { + Config.InputFilename = Positional[0]; + Config.OutputFilename = Positional[Positional.size() == 1 ? 
0 : 1]; + if (InputArgs.hasArg(OBJCOPY_target) && + (InputArgs.hasArg(OBJCOPY_input_target) || + InputArgs.hasArg(OBJCOPY_output_target))) + return createStringError( + errc::invalid_argument, + "--target cannot be used with --input-target or --output-target"); + } if (InputArgs.hasArg(OBJCOPY_regex) && InputArgs.hasArg(OBJCOPY_wildcard)) return createStringError(errc::invalid_argument, "--regex and --wildcard are incompatible"); @@ -1417,25 +1434,26 @@ objcopy::parseInstallNameToolOptions(ArrayRef ArgsArr) { Arg->getAsString(InputArgs).c_str()); for (auto *Arg : InputArgs.filtered(INSTALL_NAME_TOOL_INPUT)) Positional.push_back(Arg->getValue()); - if (Positional.empty()) + if (Positional.empty() && Config.NeedPositional) return createStringError(errc::invalid_argument, "no input file specified"); - if (Positional.size() > 1) + if (Positional.size() > 1 && Config.NeedPositional) return createStringError( errc::invalid_argument, "llvm-install-name-tool expects a single input file"); - Config.InputFilename = Positional[0]; - Config.OutputFilename = Positional[0]; - - Expected> BinaryOrErr = - createBinary(Config.InputFilename); - if (!BinaryOrErr) - return createFileError(Config.InputFilename, BinaryOrErr.takeError()); - auto *Binary = (*BinaryOrErr).getBinary(); - if (!Binary->isMachO() && !Binary->isMachOUniversalBinary()) - return createStringError(errc::invalid_argument, - "input file: %s is not a Mach-O file", - Config.InputFilename.str().c_str()); - + if (Config.NeedPositional) { + Config.InputFilename = Positional[0]; + Config.OutputFilename = Positional[0]; + + Expected> BinaryOrErr = + createBinary(Config.InputFilename); + if (!BinaryOrErr) + return createFileError(Config.InputFilename, BinaryOrErr.takeError()); + auto *Binary = (*BinaryOrErr).getBinary(); + if (!Binary->isMachO() && !Binary->isMachOUniversalBinary()) + return createStringError(errc::invalid_argument, + "input file: %s is not a Mach-O file", + Config.InputFilename.str().c_str()); + } 
DC.CopyConfigs.push_back(std::move(ConfigMgr)); return std::move(DC); } @@ -1474,13 +1492,16 @@ objcopy::parseBitcodeStripOptions(ArrayRef ArgsArr, Arg->getAsString(InputArgs).c_str()); SmallVector Positional; - for (auto *Arg : InputArgs.filtered(BITCODE_STRIP_INPUT)) - Positional.push_back(Arg->getValue()); - if (Positional.size() > 1) - return createStringError(errc::invalid_argument, - "llvm-bitcode-strip expects a single input file"); - assert(!Positional.empty()); - Config.InputFilename = Positional[0]; + if (Config.NeedPositional) { + for (auto *Arg : InputArgs.filtered(BITCODE_STRIP_INPUT)) + Positional.push_back(Arg->getValue()); + if (Positional.size() > 1) + return createStringError( + errc::invalid_argument, + "llvm-bitcode-strip expects a single input file"); + assert(!Positional.empty()); + Config.InputFilename = Positional[0]; + } if (!InputArgs.hasArg(BITCODE_STRIP_output)) { return createStringError(errc::invalid_argument, @@ -1542,27 +1563,31 @@ objcopy::parseStripOptions(ArrayRef RawArgsArr, exit(0); } - SmallVector Positional; - for (auto *Arg : InputArgs.filtered(STRIP_UNKNOWN)) - return createStringError(errc::invalid_argument, "unknown argument '%s'", - Arg->getAsString(InputArgs).c_str()); - for (auto *Arg : InputArgs.filtered(STRIP_INPUT)) - Positional.push_back(Arg->getValue()); - std::copy(DashDash, RawArgsArr.end(), std::back_inserter(Positional)); - - if (Positional.empty()) - return createStringError(errc::invalid_argument, "no input file specified"); - - if (Positional.size() > 1 && InputArgs.hasArg(STRIP_output)) - return createStringError( - errc::invalid_argument, - "multiple input files cannot be used in combination with -o"); - ConfigManager ConfigMgr; CommonConfig &Config = ConfigMgr.Common; ELFConfig &ELFConfig = ConfigMgr.ELF; MachOConfig &MachOConfig = ConfigMgr.MachO; + SmallVector Positional; + if (Config.NeedPositional) { + for (auto *Arg : InputArgs.filtered(STRIP_UNKNOWN)) + return 
createStringError(errc::invalid_argument, "unknown argument '%s'", + Arg->getAsString(InputArgs).c_str()); + for (auto *Arg : InputArgs.filtered(STRIP_INPUT)) + Positional.push_back(Arg->getValue()); + std::copy(DashDash, RawArgsArr.end(), std::back_inserter(Positional)); + + if (Positional.empty()) + return createStringError(errc::invalid_argument, + "no input file specified"); + + if (Positional.size() > 1 && InputArgs.hasArg(STRIP_output)) { + return createStringError( + errc::invalid_argument, + "multiple input files cannot be used in combination with -o"); + } + } + if (InputArgs.hasArg(STRIP_regex) && InputArgs.hasArg(STRIP_wildcard)) return createStringError(errc::invalid_argument, "--regex and --wildcard are incompatible"); diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td index fbc6a59d9461e..c6216a6b8a627 100644 --- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td +++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td @@ -239,6 +239,9 @@ defm dump_section : Eq<"dump-section", "Dump contents of section named

into file ">, MetaVarName<"section=file">; + +defm dump_offload_bundle : Eq<"dump-offload-bundle", "Dump the contents specified by URI">; + defm prefix_symbols : Eq<"prefix-symbols", "Add to the start of every symbol name">, MetaVarName<"prefix">; diff --git a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp index 7e708e309f207..362f022761eaa 100644 --- a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp +++ b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp @@ -138,88 +138,89 @@ static Error executeObjcopyOnRawBinary(ConfigManager &ConfigMgr, static Error executeObjcopy(ConfigManager &ConfigMgr) { CommonConfig &Config = ConfigMgr.Common; - Expected PermsApplierOrErr = - FilePermissionsApplier::create(Config.InputFilename); - if (!PermsApplierOrErr) - return PermsApplierOrErr.takeError(); + if (Config.NeedPositional) { + Expected PermsApplierOrErr = + FilePermissionsApplier::create(Config.InputFilename); + if (!PermsApplierOrErr) + return PermsApplierOrErr.takeError(); - std::function ObjcopyFunc; + std::function ObjcopyFunc; - OwningBinary BinaryHolder; - std::unique_ptr MemoryBufferHolder; + OwningBinary BinaryHolder; + std::unique_ptr MemoryBufferHolder; - if (Config.InputFormat == FileFormat::Binary || - Config.InputFormat == FileFormat::IHex) { - ErrorOr> BufOrErr = - MemoryBuffer::getFileOrSTDIN(Config.InputFilename); - if (!BufOrErr) - return createFileError(Config.InputFilename, BufOrErr.getError()); - MemoryBufferHolder = std::move(*BufOrErr); + if (Config.InputFormat == FileFormat::Binary || + Config.InputFormat == FileFormat::IHex) { + ErrorOr> BufOrErr = + MemoryBuffer::getFileOrSTDIN(Config.InputFilename); + if (!BufOrErr) + return createFileError(Config.InputFilename, BufOrErr.getError()); + MemoryBufferHolder = std::move(*BufOrErr); - if (Config.InputFormat == FileFormat::Binary) - ObjcopyFunc = [&](raw_ostream &OutFile) -> Error { - // Handle FileFormat::Binary. 
- return executeObjcopyOnRawBinary(ConfigMgr, *MemoryBufferHolder, - OutFile); - }; - else - ObjcopyFunc = [&](raw_ostream &OutFile) -> Error { - // Handle FileFormat::IHex. - return executeObjcopyOnIHex(ConfigMgr, *MemoryBufferHolder, OutFile); - }; - } else { - Expected> BinaryOrErr = - createBinary(Config.InputFilename); - if (!BinaryOrErr) - return createFileError(Config.InputFilename, BinaryOrErr.takeError()); - BinaryHolder = std::move(*BinaryOrErr); - - if (Archive *Ar = dyn_cast(BinaryHolder.getBinary())) { - // Handle Archive. - if (Error E = executeObjcopyOnArchive(ConfigMgr, *Ar)) - return E; + if (Config.InputFormat == FileFormat::Binary) { + ObjcopyFunc = [&](raw_ostream &OutFile) -> Error { + // Handle FileFormat::Binary. + return executeObjcopyOnRawBinary(ConfigMgr, *MemoryBufferHolder, + OutFile); + }; + } else + ObjcopyFunc = [&](raw_ostream &OutFile) -> Error { + // Handle FileFormat::IHex. + return executeObjcopyOnIHex(ConfigMgr, *MemoryBufferHolder, OutFile); + }; } else { - // Handle llvm::object::Binary. - ObjcopyFunc = [&](raw_ostream &OutFile) -> Error { - return executeObjcopyOnBinary(ConfigMgr, *BinaryHolder.getBinary(), - OutFile); - }; - } - } + Expected> BinaryOrErr = + createBinary(Config.InputFilename); + if (!BinaryOrErr) + return createFileError(Config.InputFilename, BinaryOrErr.takeError()); + BinaryHolder = std::move(*BinaryOrErr); - if (ObjcopyFunc) { - if (Config.SplitDWO.empty()) { - // Apply transformations described by Config and store result into - // Config.OutputFilename using specified ObjcopyFunc function. - if (Error E = writeToOutput(Config.OutputFilename, ObjcopyFunc)) - return E; - } else { - Config.ExtractDWO = true; - Config.StripDWO = false; - // Copy .dwo tables from the Config.InputFilename into Config.SplitDWO - // file using specified ObjcopyFunc function. 
- if (Error E = writeToOutput(Config.SplitDWO, ObjcopyFunc)) - return E; - Config.ExtractDWO = false; - Config.StripDWO = true; - // Apply transformations described by Config, remove .dwo tables and - // store result into Config.OutputFilename using specified ObjcopyFunc - // function. - if (Error E = writeToOutput(Config.OutputFilename, ObjcopyFunc)) - return E; + if (Archive *Ar = dyn_cast(BinaryHolder.getBinary())) { + // Handle Archive. + if (Error E = executeObjcopyOnArchive(ConfigMgr, *Ar)) + return E; + } else { + // Handle llvm::object::Binary. + ObjcopyFunc = [&](raw_ostream &OutFile) -> Error { + return executeObjcopyOnBinary(ConfigMgr, *BinaryHolder.getBinary(), + OutFile); + }; + } } - } - if (Error E = - PermsApplierOrErr->apply(Config.OutputFilename, Config.PreserveDates)) - return E; + if (ObjcopyFunc) { + if (Config.SplitDWO.empty()) { + // Apply transformations described by Config and store result into + // Config.OutputFilename using specified ObjcopyFunc function. + if (Error E = writeToOutput(Config.OutputFilename, ObjcopyFunc)) + return E; + } else { + Config.ExtractDWO = true; + Config.StripDWO = false; + // Copy .dwo tables from the Config.InputFilename into Config.SplitDWO + // file using specified ObjcopyFunc function. + if (Error E = writeToOutput(Config.SplitDWO, ObjcopyFunc)) + return E; + Config.ExtractDWO = false; + Config.StripDWO = true; + // Apply transformations described by Config, remove .dwo tables and + // store result into Config.OutputFilename using specified ObjcopyFunc + // function. 
+ if (Error E = writeToOutput(Config.OutputFilename, ObjcopyFunc)) + return E; + } + } - if (!Config.SplitDWO.empty()) - if (Error E = - PermsApplierOrErr->apply(Config.SplitDWO, Config.PreserveDates, - static_cast(0666))) + if (Error E = PermsApplierOrErr->apply(Config.OutputFilename, + Config.PreserveDates)) return E; + if (!Config.SplitDWO.empty()) + if (Error E = + PermsApplierOrErr->apply(Config.SplitDWO, Config.PreserveDates, + static_cast(0666))) + return E; + } return Error::success(); } diff --git a/llvm/tools/llvm-objdump/OffloadDump.cpp b/llvm/tools/llvm-objdump/OffloadDump.cpp index b129838cad5c0..834508fd5a572 100644 --- a/llvm/tools/llvm-objdump/OffloadDump.cpp +++ b/llvm/tools/llvm-objdump/OffloadDump.cpp @@ -15,6 +15,7 @@ #include "llvm-objdump.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/OffloadBinary.h" +#include "llvm/Object/OffloadBundle.h" #include "llvm/Support/Alignment.h" using namespace llvm; @@ -51,7 +52,7 @@ static void printBinary(const OffloadBinary &OB, uint64_t Index) { } /// Print the embedded offloading contents of an ObjectFile \p O. -void llvm::dumpOffloadBinary(const ObjectFile &O) { +void llvm::dumpOffloadBinary(const ObjectFile &O, StringRef ArchName) { if (!O.isELF() && !O.isCOFF()) { reportWarning( "--offloading is currently only supported for COFF and ELF targets", @@ -67,55 +68,53 @@ void llvm::dumpOffloadBinary(const ObjectFile &O) { // Print out all the binaries that are contained in this buffer. 
for (uint64_t I = 0, E = Binaries.size(); I != E; ++I) printBinary(*Binaries[I].getBinary(), I); + + dumpOffloadBundleFatBinary(O, ArchName); } // Given an Object file, collect all Bundles of FatBin Binaries // and dump them into Code Object files -// if -d is specified, disassemble the Code Object Files // if -arch=-name is specified, only dump the Entries that match the target arch -void llvm::dumpOffloadBundleFatBinary(const ObjectFile &O, std::string ArchName, - bool Disassemble) { - assert((O.isELF() || O.isCOFF()) && "Invalid file type"); - // Collect all Bundles and their Entries .... +void llvm::dumpOffloadBundleFatBinary(const ObjectFile &O, StringRef ArchName) { + if (!O.isELF() && !O.isCOFF()) { + reportWarning( + "--offloading is currently only supported for COFF and ELF targets", + O.getFileName()); + return; + } + SmallVector FoundBundles; - SmallVector FoundEntries; if (Error Err = llvm::object::extractOffloadBundleFatBinary(O, FoundBundles)) reportError(O.getFileName(), "while extracting offload FatBin bundles: " + toString(std::move(Err))); + for (const auto &[BundleNum, Bundle] : llvm::enumerate(FoundBundles)) { + for (OffloadBundleEntry &Entry : Bundle.getEntries()) { + if (!ArchName.empty() && (Entry.ID.find(ArchName) != std::string::npos)) + continue; - // Now filter based on if arch-name is specified - SmallVectorImpl::iterator BundleIter = - FoundBundles.begin(); - for (uint64_t bundle_num = 0; bundle_num < FoundBundles.size(); - bundle_num++) { - if (!ArchName.empty()) - FoundEntries = BundleIter->EntryIDContains(StringRef(ArchName)); - else - FoundEntries = BundleIter->getEntries(); - - // now we have a list of Found Entries .... dump them - SmallVectorImpl::iterator FoundIter = - FoundEntries.begin(); - for (int64_t entry_num = 0; entry_num < FoundEntries.size(); entry_num++) { - // create file name for this object file: :.. - std::string str = BundleIter->getFileName().str() + ":" + - itostr(bundle_num) + "." 
+ FoundIter->ID.str(); - StringRef OutputFilename = StringRef(str); - if (Error Err = object::extractCodeObject( - O, FoundIter->Offset, FoundIter->Size, OutputFilename)) - reportError(O.getFileName(), - "while extracting offload Bundle Entries: " + - toString(std::move(Err))); - - // TODO: If -d was specified, disasseble the Code Object too - - ++FoundIter; - } // end of for found_entries loop - - ++BundleIter; - } // end of for Bundles loop + std::string str = + Bundle.getFileName().str() + "." + itostr(BundleNum) + "." + Entry.ID; + + if (Bundle.isDecompressed()) { + if (Error Err = object::extractCodeObject( + Bundle.DecompressedBuffer->getMemBufferRef(), Entry.Offset, + Entry.Size, StringRef(str))) + reportError(O.getFileName(), + "while extracting offload Bundle Entries: " + + toString(std::move(Err))); + } else { + if (Error Err = object::extractCodeObject(O, Entry.Offset, Entry.Size, + StringRef(str))) + reportError(O.getFileName(), + "while extracting offload Bundle Entries: " + + toString(std::move(Err))); + } + outs() << "Extracting offload bundle: " << str << "\n"; + } + } } /// Print the contents of an offload binary file \p OB. 
This may contain diff --git a/llvm/tools/llvm-objdump/OffloadDump.h b/llvm/tools/llvm-objdump/OffloadDump.h index 5eddce9d247bc..229d479ae357b 100644 --- a/llvm/tools/llvm-objdump/OffloadDump.h +++ b/llvm/tools/llvm-objdump/OffloadDump.h @@ -11,15 +11,16 @@ #include "llvm/Object/ObjectFile.h" #include "llvm/Object/OffloadBinary.h" +#include "llvm/Object/OffloadBundle.h" namespace llvm { void dumpOffloadSections(const object::OffloadBinary &OB); -void dumpOffloadBinary(const object::ObjectFile &O); +void dumpOffloadBinary(const object::ObjectFile &O, StringRef ArchName); /// Dump fat binary in binary clang-offload-bundler format void dumpOffloadBundleFatBinary(const object::ObjectFile &O, - std::string ArchName, bool Disassemble); + StringRef ArchName); } // namespace llvm #endif diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index d3e6a803682c7..efe400a4b1e5a 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -56,6 +56,7 @@ #include "llvm/Object/MachO.h" #include "llvm/Object/MachOUniversal.h" #include "llvm/Object/OffloadBinary.h" +#include "llvm/Object/OffloadBundle.h" #include "llvm/Object/Wasm.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" @@ -3318,7 +3319,7 @@ static void dumpObject(ObjectFile *O, const Archive *A = nullptr, if (SectionContents) printSectionContents(O); if (OffloadFatBin) - dumpOffloadBundleFatBinary(*O, ArchName, Disassemble); + dumpOffloadBundleFatBinary(*O, ArchName); if (Disassemble) disassembleObject(O, Relocations); if (UnwindInfo) @@ -3342,7 +3343,7 @@ static void dumpObject(ObjectFile *O, const Archive *A = nullptr, if (FaultMapSection) printFaultMaps(O); if (Offloading) - dumpOffloadBinary(*O); + dumpOffloadBinary(*O, StringRef(ArchName)); } static void dumpObject(const COFFImportFile *I, const Archive *A, diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp index 
d3c613ee823ba..326e6a41531aa 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.cpp +++ b/llvm/tools/llvm-readobj/ObjDumper.cpp @@ -16,6 +16,8 @@ #include "llvm/Object/Archive.h" #include "llvm/Object/Decompressor.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Object/OffloadBinary.h" +#include "llvm/Object/OffloadBundle.h" #include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/ScopedPrinter.h" @@ -230,4 +232,21 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj, } } +// TODO: add proper error handling. +void ObjDumper::printOffloading(const object::ObjectFile &Obj) { + // we can use an argument to let user select which offloading section they + // want to print. but for now, we're hardcoding ELF and "hip_fatbin". + assert((Obj.isELF() || Obj.isCOFF()) && "Invalid file type"); + + SmallVector Bundles; + if (Error Err = llvm::object::extractOffloadBundleFatBinary(Obj, Bundles)) + reportWarning(createError("Cannot extract Fatbin Binary from Object."), + Obj.getFileName()); + + // Print out all the FatBin Bundles that are contained in this buffer. 
+ for (const auto &[Index, Bundle] : llvm::enumerate(Bundles)) { + Bundle.printEntriesAsURI(); + } +} + } // namespace llvm diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h index cd744e3bbfb71..aaa294c3c3f25 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.h +++ b/llvm/tools/llvm-readobj/ObjDumper.h @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Object/OffloadBinary.h" #include "llvm/Support/CommandLine.h" #include @@ -184,6 +185,7 @@ class ObjDumper { std::function WarningHandler; void reportUniqueWarning(Error Err) const; void reportUniqueWarning(const Twine &Msg) const; + void printOffloading(const object::ObjectFile &Obj); protected: ScopedPrinter &W; diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td index 7d574d875d22e..df681555ea9e1 100644 --- a/llvm/tools/llvm-readobj/Opts.td +++ b/llvm/tools/llvm-readobj/Opts.td @@ -64,6 +64,9 @@ def notes : FF<"notes", "Display notes">, Group; def program_headers : FF<"program-headers", "Display program headers">, Group; def version_info : FF<"version-info", "Display version sections">, Group; +def offloading : Flag<["--"], "offloading">, + HelpText<"Display the content of the offloading section">; + // Mach-O specific options. def grp_mach_o : OptionGroup<"kind">, HelpText<"OPTIONS (Mach-O specific)">; def macho_data_in_code : FF<"macho-data-in-code", "Display Data in Code command">, Group; diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp index 2f77e5d350553..ac96049dfd3a2 100644 --- a/llvm/tools/llvm-readobj/llvm-readobj.cpp +++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp @@ -138,6 +138,7 @@ static bool Notes; static bool ProgramHeaders; static bool SectionGroups; static bool VersionInfo; +static bool Offloading; // Mach-O specific options. 
static bool MachODataInCode; @@ -289,6 +290,7 @@ static void parseOptions(const opt::InputArgList &Args) { } } opts::VersionInfo = Args.hasArg(OPT_version_info); + opts::Offloading = Args.hasArg(OPT_offloading); // Mach-O specific options. opts::MachODataInCode = Args.hasArg(OPT_macho_data_in_code); @@ -456,6 +458,8 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer, Dumper->printGnuHashTable(); if (opts::VersionInfo) Dumper->printVersionInfo(); + if (opts::Offloading) + Dumper->printOffloading(Obj); if (opts::StringTable) Dumper->printStringTable(); if (Obj.isELF()) { @@ -700,6 +704,7 @@ int llvm_readobj_main(int argc, char **argv, const llvm::ToolContext &) { opts::DynamicTable = true; opts::Notes = true; opts::VersionInfo = true; + opts::Offloading = true; opts::UnwindInfo = true; opts::SectionGroups = true; opts::HashHistogram = true; diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 91774555c79ac..a8c6e7d6a1ac6 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -6197,6 +6197,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { F->addFnAttr("target-features", "+mmx,+sse"); IRBuilder<> Builder(BB); auto *Int32Ty = Builder.getInt32Ty(); + Builder.SetCurrentDebugLocation(DL); AllocaInst *APtr = Builder.CreateAlloca(Int32Ty, nullptr, "a_ptr"); AllocaInst *BPtr = Builder.CreateAlloca(Int32Ty, nullptr, "b_ptr"); @@ -6206,6 +6207,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { Builder.CreateStore(Builder.getInt32(20), BPtr); auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) -> InsertPointTy { + IRBuilderBase::InsertPointGuard guard(Builder); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); Builder.restoreIP(CodeGenIP); LoadInst *AVal = Builder.CreateLoad(Int32Ty, APtr); LoadInst *BVal = Builder.CreateLoad(Int32Ty, BPtr); @@ -6223,6 +6226,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { 
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal, llvm::OpenMPIRBuilder::InsertPointTy AllocaIP, llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) { + IRBuilderBase::InsertPointGuard guard(Builder); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); if (!OMPBuilder.Config.isTargetDevice()) { RetVal = cast(&Arg); return CodeGenIP; @@ -6269,6 +6274,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { Builder.saveIP(), EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, Inputs, GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); + EXPECT_EQ(DL, Builder.getCurrentDebugLocation()); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -6368,6 +6374,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { F->addFnAttr("target-features", "+gfx9-insts,+wavefrontsize64"); IRBuilder<> Builder(BB); OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); + Builder.SetCurrentDebugLocation(DL); LoadInst *Value = nullptr; StoreInst *TargetStore = nullptr; @@ -6379,6 +6386,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { [&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal, llvm::OpenMPIRBuilder::InsertPointTy AllocaIP, llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) { + IRBuilderBase::InsertPointGuard guard(Builder); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); if (!OMPBuilder.Config.isTargetDevice()) { RetVal = cast(&Arg); return CodeGenIP; @@ -6412,6 +6421,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP) -> OpenMPIRBuilder::InsertPointTy { + IRBuilderBase::InsertPointGuard guard(Builder); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); Builder.restoreIP(CodeGenIP); Value = Builder.CreateLoad(Type::getInt32Ty(Ctx), CapturedArgs[0]); TargetStore = Builder.CreateStore(Value, CapturedArgs[1]); @@ -6433,6 +6444,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { EntryInfo, DefaultAttrs, RuntimeAttrs, 
/*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); + EXPECT_EQ(DL, Builder.getCurrentDebugLocation()); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -6742,6 +6754,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { F->setName("func"); IRBuilder<> Builder(BB); OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); + Builder.SetCurrentDebugLocation(DL); LoadInst *Value = nullptr; StoreInst *TargetStore = nullptr; @@ -6752,6 +6765,8 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { [&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal, llvm::OpenMPIRBuilder::InsertPointTy AllocaIP, llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) { + IRBuilderBase::InsertPointGuard guard(Builder); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); if (!OMPBuilder.Config.isTargetDevice()) { RetVal = cast(&Arg); return CodeGenIP; @@ -6787,6 +6802,8 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP) -> OpenMPIRBuilder::InsertPointTy { + IRBuilderBase::InsertPointGuard guard(Builder); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); Builder.restoreIP(CodeGenIP); RaiseAlloca = Builder.CreateAlloca(Builder.getInt32Ty()); Value = Builder.CreateLoad(Type::getInt32Ty(Ctx), CapturedArgs[0]); @@ -6809,6 +6826,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); + EXPECT_EQ(DL, Builder.getCurrentDebugLocation()); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -7662,4 +7680,20 @@ TEST_F(OpenMPIRBuilderTest, createGPUOffloadEntry) { EXPECT_TRUE(Fn->hasFnAttribute(Attribute::MustProgress)); } +TEST_F(OpenMPIRBuilderTest, splitBB) { + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.Config.IsTargetDevice = false; + OMPBuilder.initialize(); + F->setName("func"); + IRBuilder<> Builder(BB); + + 
Builder.SetCurrentDebugLocation(DL); + AllocaInst *alloc = Builder.CreateAlloca(Builder.getInt32Ty()); + EXPECT_TRUE(DL == alloc->getStableDebugLoc()); + BasicBlock *AllocaBB = Builder.GetInsertBlock(); + splitBB(Builder, /*CreateBranch=*/true, "test"); + if (AllocaBB->getTerminator()) + EXPECT_TRUE(DL == AllocaBB->getTerminator()->getStableDebugLoc()); +} + } // namespace diff --git a/llvm/unittests/Object/CMakeLists.txt b/llvm/unittests/Object/CMakeLists.txt index 81bc4a5577e68..1343352d1dc69 100644 --- a/llvm/unittests/Object/CMakeLists.txt +++ b/llvm/unittests/Object/CMakeLists.txt @@ -16,6 +16,7 @@ add_llvm_unittest(ObjectTests MinidumpTest.cpp ObjectFileTest.cpp OffloadingTest.cpp + OffloadingBundleTest.cpp SymbolSizeTest.cpp SymbolicFileTest.cpp XCOFFObjectFileTest.cpp diff --git a/llvm/unittests/Object/OffloadingBundleTest.cpp b/llvm/unittests/Object/OffloadingBundleTest.cpp new file mode 100644 index 0000000000000..a4d6986e3c0a7 --- /dev/null +++ b/llvm/unittests/Object/OffloadingBundleTest.cpp @@ -0,0 +1,85 @@ +// Skip running on Windows. 
+#if !defined(_WIN32) + +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/OffloadBinary.h" +#include "llvm/Object/OffloadBundle.h" +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/FileOutputBuffer.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" +#include + +using namespace llvm; +using namespace llvm::object; + +StringRef simpleAdd = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 + Entry: 0x2041B0 +Sections: + - Name: .hip_fatbin + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x201000 + AddressAlign: 0x1000 + Content: 5F5F434C414E475F4F46464C4F41445F42554E444C455F5F0200000000000000001000000000000000000000000000001B00000000000000686F73742D7838365F36342D756E6B6E6F776E2D6C696E75782D2D0010000000000000D00F0000000000001F0000000000000068697076342D616D6467636E2D616D642D616D646873612D2D676678393038000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000007F454C460201014003000000000000000300E0000100000000000000000000004000000000000000100C0000000000003005000040003800090040000F000D000600000004000000400000000000000040000000000000004000000000000000F801000000000000F80100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000C008000000000000C008000000000000001000000000000001000000050000000009000000000000001900000000000000190000000000006C000000000000006C00000000000000001000000000000001000000060000007009000000000000702900000000000070290000000000007000000000000000900600000000000000100000000000000100000006000000E009000000000000E039000000000000E039000000000000000000000000000001000000000000000010000000000000020000000600000070090000000000007029000000000000702900000000000070000000000000007000000000000000080000000000000052E574640400000070090000000000007029000000000000702900000000000070000000000000009006000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004000000040000003802000000000000380200000000000038020000000000003405
00000000000034050000000000000400000000000000070000001D05000020000000414D44475055000083AE616D646873612E6B65726E656C7391DE0012AB2E616770725F636F756E7400A52E61726773DC001085AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA415F642E636F65726365A72E6F666673657400A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657285AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA425F642E636F65726365A72E6F666673657408A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657284A52E6E616D65A14EA72E6F666673657410A52E73697A6508AB2E76616C75655F6B696E64A862795F76616C756583A72E6F666673657418A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7883A72E6F66667365741CA52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7983A72E6F666673657420A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7A83A72E6F666673657424A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7883A72E6F666673657426A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7983A72E6F666673657428A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7A83A72E6F66667365742AA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7883A72E6F66667365742CA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7983A72E6F66667365742EA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7A83A72E6F666673657440A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7883A72E6F666673657448A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7983A72E6F666673657450A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7A83A72E6F666673657458A52E73697A6502AB2E76616C75655F6B696E64B068696464656E5F677269645F64696D73B92E67726F75705F7365676D656E745F66697865645F73697A6500B62E6B65726E6172675F7365676D656E745F616C69676E08B52E6B65726E6172675F7365
676D656E745F73697A65CD0118A92E6C616E6775616765A84F70656E434C2043B12E6C616E67756167655F76657273696F6E920200B82E6D61785F666C61745F776F726B67726F75705F73697A65CD0400A52E6E616D65B25F5A3973696D706C65416464506A504B6A6DBB2E707269766174655F7365676D656E745F66697865645F73697A6500AB2E736770725F636F756E740CB12E736770725F7370696C6C5F636F756E7400A72E73796D626F6CB55F5A3973696D706C65416464506A504B6A6D2E6B64B82E756E69666F726D5F776F726B5F67726F75705F73697A6501B32E757365735F64796E616D69635F737461636BC2AB2E766770725F636F756E7404B12E766770725F7370696C6C5F636F756E7400AF2E7761766566726F6E745F73697A6540AD616D646873612E746172676574B9616D6467636E2D616D642D616D646873612D2D676678393038AE616D646873612E76657273696F6E92010200000000000000000000000000000000000000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E03900000000000001000000000000000100000001000000010000001A000000000008400000D20001000000360A4A7A5238A4D3F113F4DD04000000040000000200000001000000000000000300000000000000000000000000000000000000005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F623730363264386333326134613933330000000000000000000000000000000000000000000000000000000000000000000000180100000000000080100000000000000000000000000000000000000000000000000000000000004000AF008C000000090000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000C20102C02400000002000AC0000000008002027E7FC08CBF07FF0486FFFF0000060406920600006800008FD2820002000302067E0200043203030638008050DC02007F020102067E0000003203030238008050DC00007F03700F8CBF03050468008070DC00027F00000081BF00000000060000000000000070070000000000000B000000000000001800000000000000050000000000000020080000000000000A000000000000004600000000000000F5FEFF6F00000000D0070000000000000400000000000000F807000000000000000000000000000000000000000000004C696E6B65723A20414D44204C4C44203139
2E302E3000414D4420636C616E672076657273696F6E2031392E302E306769742028202032343231322063393630313665636534313337356462646438663037356266333762643666633333323230376233290000414D4420636C616E672076657273696F6E2031382E302E3067697420287373683A2F2F6765727269746769742F6C696768746E696E672F65632F6C6C766D2D70726F6A65637420616D642D6D61696E6C696E652D6F70656E20323431373620663935303039613166393032313232343865313036333964653837653635636163616338643961372900000000000000000000000000000000000000000000000000460000000002080070290000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E0390000000000000100000000000000002E6E6F7465002E64796E73796D002E676E752E68617368002E68617368002E64796E737472002E726F64617461002E74657874002E64796E616D6963002E72656C726F5F70616464696E67002E627373002E636F6D6D656E74002E73796D746162002E7368737472746162002E73747274616200005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F62373036326438633332613461393333005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000070000000200000000000000380200000000000038020000000000003405000000000000000000000000000004000000000000000000000000000000070000000B00000002000000000000007007000000000000700700000000000060000000000000000500000001000000080000000000000018000000000000000F000000F6FFFF6F0200000000000000D007000000000000D007000000000000280000000000000002000000000000000800000000000000000000000000000019000000050000000200000000000000F807000000000000F80700000000000028000000000000000200000000000000040000000000000004000000000000001F000000030000000200000000000000200800000000000020080000000000004600000000000000000000000000000001000000000000000000000000000000270000000100000002000000000000008008000000000000800800000000000040000000000000000000000000000000400000000000000000000000000000002F000000010000000600
000000000000001900000000000000090000000000006C00000000000000000000000000000000010000000000000000000000000000350000000600000003000000000000007029000000000000700900000000000070000000000000000500000000000000080000000000000010000000000000003E000000080000000300000000000000E029000000000000E00900000000000020060000000000000000000000000000010000000000000000000000000000004D000000080000000300000000000000E039000000000000E0090000000000000100000000000000000000000000000001000000000000000000000000000000520000000100000030000000000000000000000000000000E009000000000000F0000000000000000000000000000000010000000000000001000000000000005B0000000200000000000000000000000000000000000000D00A00000000000078000000000000000E0000000200000008000000000000001800000000000000630000000300000000000000000000000000000000000000480B00000000000075000000000000000000000000000000010000000000000000000000000000006D0000000300000000000000000000000000000000000000BD0B0000000000004F00000000000000000000000000000001000000000000000000000000000000 + - Name: .hipFatBinSegment + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x202FD0 + AddressAlign: 0x8 + Content: '465049480100000000102000000000000000000000000000' +)"; + +// ELF Object creation +static Expected> +toBinary(SmallVectorImpl &Storage, StringRef Yaml) { + raw_svector_ostream OS(Storage); + yaml::Input YIn(Yaml); + if (!yaml::convertYAML(YIn, OS, [](const Twine &Msg) {})) + return createStringError(std::errc::invalid_argument, + "unable to convert YAML"); + return object::ObjectFile::createELFObjectFile( + MemoryBufferRef(OS.str(), "dummyELF")); +} + +TEST(OffloadingBundleTest, checkExtractOffloadBundleFatBinary) { + + // create a Memory Buffer with a fatbin offloading section + SmallVector(); + SmallString<0> Storage; + // Expected> ObjOrErr = toBinary(Storage, R"( + Expected> ObjOrErr = toBinary(Storage, simpleAdd); + + ASSERT_THAT_EXPECTED(ObjOrErr, Succeeded()); + + SmallVector Bundles; + Error Err = extractOffloadBundleFatBinary(**ObjOrErr, Bundles); + 
EXPECT_FALSE(errorToBool(std::move(Err))); +} + +TEST(OffloadingBundleTest, checkExtractCodeObject) { + // create a Memory Buffer with a fatbin offloading section + SmallVector(); + SmallString<0> Storage; + // Expected> ObjOrErr = toBinary(Storage, R"( + Expected> ObjOrErr = toBinary(Storage, simpleAdd); + + ASSERT_THAT_EXPECTED(ObjOrErr, Succeeded()); + + int64_t Offset = 8192; + int64_t Size = 4048; + + Error Err = extractCodeObject(**ObjOrErr, Offset, Size, + StringRef("checkExtractCodeObject.co")); + EXPECT_FALSE(errorToBool(std::move(Err))); +} + +#endif diff --git a/llvm/utils/lit/lit/llvm/config.py b/llvm/utils/lit/lit/llvm/config.py index 5f762ec7f3514..fe6b70bc96237 100644 --- a/llvm/utils/lit/lit/llvm/config.py +++ b/llvm/utils/lit/lit/llvm/config.py @@ -349,6 +349,14 @@ def get_clang_has_lsan(self, clang, triple): return False + # Normalize 3-field target triple to 4-field triple with "unknown" as environment + def normalize_triple(self, triple): + compoments = triple.split("-", maxsplit=3) + if len(compoments) == 4: + return triple + assert len(compoments) == 3 + return triple + "-unknown" + def make_itanium_abi_triple(self, triple): m = re.match(r"(\w+)-(\w+)-(\w+)", triple) if not m: @@ -659,7 +667,9 @@ def use_clang( self.config.substitutions.append( ( "%itanium_abi_triple", - self.make_itanium_abi_triple(self.config.target_triple), + self.normalize_triple( + self.make_itanium_abi_triple(self.config.target_triple) + ), ) ) self.config.substitutions.append( diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp index 87782e84dd6e4..7eb59fdc532c0 100644 --- a/mlir/lib/IR/MLIRContext.cpp +++ b/mlir/lib/IR/MLIRContext.cpp @@ -59,7 +59,8 @@ struct MLIRContextOptions { llvm::cl::opt disableThreading{ "mlir-disable-threading", llvm::cl::desc("Disable multi-threading within MLIR, overrides any " - "further call to MLIRContext::enableMultiThreading()")}; + "further call to MLIRContext::enableMultiThreading()"), + llvm::cl::init(true)}; 
llvm::cl::opt printOpOnDiagnostic{ "mlir-print-op-on-diagnostic", @@ -77,7 +78,7 @@ struct MLIRContextOptions { static llvm::ManagedStatic clOptions; static bool isThreadingGloballyDisabled() { -#if LLVM_ENABLE_THREADS != 0 +#if MLIR_ENABLE_THREADS != 0 return clOptions.isConstructed() && clOptions->disableThreading; #else return true; diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 6905fe9987999..3a180bc4d3201 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1544,7 +1544,8 @@ allocatePrivateVars(llvm::IRBuilderBase &builder, llvm::cast(allocaIP.getBlock()->getTerminator()); splitBB(llvm::OpenMPIRBuilder::InsertPointTy(allocaIP.getBlock(), allocaTerminator->getIterator()), - true, "omp.region.after_alloca"); + true, allocaTerminator->getStableDebugLoc(), + "omp.region.after_alloca"); llvm::IRBuilderBase::InsertPointGuard guard(builder); // Update the allocaTerminator in case the alloca block was split above. @@ -4158,6 +4159,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, } break; case BodyGenTy::DupNoPriv: + // We must always restoreIP regardless of doing anything the caller + // does not restore it, leading to incorrect (no) branch generation. 
+ builder.restoreIP(codeGenIP); break; case BodyGenTy::NoPriv: // If device info is available then region has already been generated @@ -4933,6 +4937,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) -> llvm::OpenMPIRBuilder::InsertPointOrErrorTy { + llvm::IRBuilderBase::InsertPointGuard guard(builder); + builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Forward target-cpu and target-features function attributes from the // original function to the new outlined function. llvm::Function *llvmParentFn = @@ -5028,6 +5034,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, llvm::Value *&retVal, InsertPointTy allocaIP, InsertPointTy codeGenIP) -> llvm::OpenMPIRBuilder::InsertPointOrErrorTy { + llvm::IRBuilderBase::InsertPointGuard guard(builder); + builder.SetCurrentDebugLocation(llvm::DebugLoc()); // We just return the unaltered argument for the host function // for now, some alterations may be required in the future to // keep host fallback functions working identically to the device diff --git a/mlir/test/Pass/invalid-pass.mlir b/mlir/test/Pass/invalid-pass.mlir index 649f723aa8f72..765b6313d64ef 100644 --- a/mlir/test/Pass/invalid-pass.mlir +++ b/mlir/test/Pass/invalid-pass.mlir @@ -1,6 +1,6 @@ // RUN: not mlir-opt %s -pass-pipeline='builtin.module(builtin.module(test-module-pass{test-option=a}))' 2>&1 | FileCheck %s -// RUN: not mlir-opt %s -mlir-print-ir-module-scope -mlir-print-ir-before=cse 2>&1 | FileCheck -check-prefix=PRINT_MODULE_IR_WITH_MULTITHREAD %s - +// RUN: not mlir-opt %s -mlir-disable-threading=0 -mlir-print-ir-module-scope -mlir-print-ir-before=cse 2>&1 | FileCheck -check-prefix=PRINT_MODULE_IR_WITH_MULTITHREAD %s +// XFAIL: * // CHECK: : no such option test-option // CHECK: failed to add `test-module-pass` with options `test-option=a` // CHECK: failed to add `builtin.module` with 
options `` to inner pipeline diff --git a/mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir b/mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir new file mode 100644 index 0000000000000..f17fd96df00f3 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir @@ -0,0 +1,67 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s +// REQUIRES: downstream_stability + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} { + omp.private {type = private} @_QFEj_private_i32 : i32 loc(#loc1) + omp.private {type = private} @_QFEi_private_i32 : i32 loc(#loc1) + llvm.func @test() { + %3 = llvm.mlir.constant(1 : i64) : i64 + %4 = llvm.alloca %3 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr<5> loc(#loc4) + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr loc(#loc4) + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr<5> loc(#loc4) + %8 = llvm.addrspacecast %7 : !llvm.ptr<5> to !llvm.ptr + %9 = llvm.mlir.constant(16383 : index) : i64 + %10 = llvm.mlir.constant(0 : index) : i64 + %11 = llvm.mlir.constant(1 : index) : i64 + %12 = llvm.mlir.constant(16384 : i32) : i32 + %14 = llvm.mlir.addressof @_QFEarray : !llvm.ptr + %18 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} loc(#loc3) + %20 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "j"} loc(#loc3) + %22 = omp.map.bounds lower_bound(%10 : i64) upper_bound(%9 : i64) extent(%9 : i64) stride(%11 : i64) start_idx(%11 : i64) loc(#loc3) + %23 = omp.map.info var_ptr(%14 : !llvm.ptr, !llvm.array<16384 x i32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%22) -> !llvm.ptr {name = "array"} loc(#loc3) + %24 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) 
map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} loc(#loc3) + omp.target map_entries(%18 -> %arg0, %20 -> %arg2, %23 -> %arg4, %24 -> %arg5 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) { + %25 = llvm.mlir.constant(1 : i32) : i32 + %27 = llvm.mlir.constant(16384 : i32) : i32 + omp.teams { + omp.distribute private(@_QFEi_private_i32 %arg5 -> %arg6 : !llvm.ptr) { + omp.loop_nest (%arg7) : i32 = (%25) to (%27) inclusive step (%25) { + omp.parallel { + omp.wsloop private(@_QFEj_private_i32 %arg2 -> %arg8 : !llvm.ptr) { + omp.loop_nest (%arg9) : i32 = (%25) to (%27) inclusive step (%25) { + llvm.store %arg9, %arg8 : i32, !llvm.ptr loc(#loc9) + omp.yield + } loc(#loc9) + } loc(#loc9) + omp.terminator loc(#loc9) + } loc(#loc9) + omp.yield loc(#loc9) + } loc(#loc9) + } loc(#loc9) + omp.terminator loc(#loc9) + } loc(#loc9) + omp.terminator loc(#loc9) + } loc(#loc9) + llvm.return loc(#loc9) + } loc(#loc14) + llvm.mlir.global internal @_QFEarray() {addr_space = 0 : i32} : !llvm.array<16384 x i32> { + %0 = llvm.mlir.zero : !llvm.array<16384 x i32> + llvm.return %0 : !llvm.array<16384 x i32> + } loc(#loc2) +} +#di_file = #llvm.di_file<"test.f90" in ""> +#di_null_type = #llvm.di_null_type +#loc1 = loc("test.f90":4:23) +#loc2 = loc("test.f90":4:15) +#loc3 = loc("test.f90":1:7) +#loc4 = loc("test.f90":4:18) +#loc9 = loc("test.f90":13:11) +#di_compile_unit = #llvm.di_compile_unit, sourceLanguage = DW_LANG_Fortran95, file = #di_file, producer = "flang", isOptimized = true, emissionKind = LineTablesOnly> +#di_subroutine_type = #llvm.di_subroutine_type +#di_subprogram = #llvm.di_subprogram, compileUnit = #di_compile_unit, scope = #di_file, name = "main", file = #di_file, subprogramFlags = "Definition|Optimized|MainSubprogram", type = #di_subroutine_type> +#loc14 = loc(fused<#di_subprogram>[#loc3]) + + +// CHECK: call void @__kmpc_distribute_static{{.*}}!dbg + diff --git a/mlir/test/Target/LLVMIR/omptarget-debug-nowait.mlir 
b/mlir/test/Target/LLVMIR/omptarget-debug-nowait.mlir new file mode 100644 index 0000000000000..eaa88d9dd6053 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-debug-nowait.mlir @@ -0,0 +1,40 @@ +// RUN: mlir-translate -mlir-to-llvmir %s + +module attributes {omp.is_target_device = false} { + llvm.func @main() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x f32 : (i64) -> !llvm.ptr + %3 = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr + %6 = omp.map.info var_ptr(%1 : !llvm.ptr, f32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr + %7 = omp.map.info var_ptr(%3 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr + omp.target nowait map_entries(%6 -> %arg0, %7 -> %arg1 : !llvm.ptr, !llvm.ptr) { + %8 = llvm.mlir.constant(0 : i64) : i64 + %9 = llvm.mlir.constant(100 : i32) : i32 + llvm.br ^bb1(%9, %8 : i32, i64) + ^bb1(%13: i32, %14: i64): // 2 preds: ^bb0, ^bb2 + %15 = llvm.icmp "sgt" %14, %8 : i64 + llvm.cond_br %15, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + llvm.store %13, %arg1 : i32, !llvm.ptr + llvm.br ^bb1(%13, %14 : i32, i64) + ^bb3: // pred: ^bb1 + llvm.store %13, %arg1 : i32, !llvm.ptr + omp.terminator + } + llvm.return + } loc(#loc2) +} + +#file = #llvm.di_file<"test.f90" in ""> +#di_null_type = #llvm.di_null_type +#cu = #llvm.di_compile_unit, + sourceLanguage = DW_LANG_Fortran95, file = #file, isOptimized = false, + emissionKind = Full> +#sp_ty = #llvm.di_subroutine_type +#sp = #llvm.di_subprogram + +#loc1 = loc("test.f90":6:7) +#loc2 = loc(fused<#sp>[#loc1]) + diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index 1dfef57a2285c..df9116396a138 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -901,7 +901,6 @@ template class StaticLoopChunker { Ty ThreadChunk = 0; Ty NumThreads = 1; Ty TId = 0; - ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id"); // All teams need to participate. 
Ty NumBlocks = mapping::getNumberOfBlocksInKernel(); diff --git a/offload/EnableOffloadRuntime b/offload/EnableOffloadRuntime new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/offload/hostexec/CMakeLists.txt b/offload/hostexec/CMakeLists.txt index 4b6e5f81e1e1a..69d8d2201e181 100644 --- a/offload/hostexec/CMakeLists.txt +++ b/offload/hostexec/CMakeLists.txt @@ -10,7 +10,7 @@ # ##===----------------------------------------------------------------------===## -cmake_minimum_required(VERSION 3.0 FATAL_ERROR) +cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR) if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") message(FATAL_ERROR "Direct configuration not supported, please use parent directory!") @@ -126,6 +126,29 @@ else() return() endif() +set(amdgpu_mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx902 gfx906 gfx908 gfx90a gfx90c gfx940 gfx941 gfx942 gfx950 gfx1010 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1153 gfx1200 gfx1201) +if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST) + set(amdgpu_mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST}) +endif() + +set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80 86 89 90) +set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${all_capabilities} CACHE STRING + "List of CUDA Compute Capabilities to be used to compile the NVPTX DeviceRTL.") +string(TOLOWER ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES} LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES) +if (LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "all") + set(nvptx_sm_list ${all_capabilities}) +elseif(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "auto") + if (NOT LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_error_say("[NVPTX] Cannot auto detect compute capability as CUDA not found.") + endif() + set(nvptx_sm_list ${LIBOMPTARGET_DEP_CUDA_ARCH}) +else() + string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}") +endif() +foreach(sm ${nvptx_sm_list}) + list(APPEND nvptx_mcpus 
"sm_${sm}") +endforeach() + set(ocl_atomics_cl_filename ${CMAKE_CURRENT_SOURCE_DIR}/src/oclAtomics.cl) set(invoke_cpp_file_name ${CMAKE_CURRENT_SOURCE_DIR}/src/hostexec_invoke.cpp) set(hostexec_stubs_filename ${CMAKE_CURRENT_SOURCE_DIR}/src/hostexec_stubs.cpp) diff --git a/offload/include/OpenMP/OMPT/OmptTracingBuffer.h b/offload/include/OpenMP/OMPT/OmptTracingBuffer.h index 93fca1c256f89..088e9d2328ee3 100644 --- a/offload/include/OpenMP/OMPT/OmptTracingBuffer.h +++ b/offload/include/OpenMP/OMPT/OmptTracingBuffer.h @@ -31,6 +31,8 @@ #include +#include "Shared/EnvironmentVar.h" + // Maximum number of devices supported in device tracing. No device tracing // will be performed for any device-id larger than 1023. #define MAX_NUM_DEVICES 1024 @@ -102,6 +104,12 @@ class OmptTracingBufferMgr { using BufPtr = std::shared_ptr; private: + /// Envar to control whether a buffer should be flushed when it gets full. + BoolEnvar OMPX_FlushOnBufferFull; + + /// Envar to control whether all buffers should be flushed during shutdown. + BoolEnvar OMPX_FlushOnShutdown; + // Internal variable for tracking threads to wait for flush uint32_t ThreadFlushTracker; @@ -216,8 +224,8 @@ class OmptTracingBufferMgr { /// Called when a buffer \p Buf may be flushed with \p Cursor as the /// last allocated trace record in the buffer. - /// setComplete should be called without holding any lock. - void setComplete(void *Cursor, BufPtr Buf); + /// triggerFlushOnBufferFull should be called without holding any lock. 
+ void triggerFlushOnBufferFull(void *Cursor, BufPtr Buf); // Called to dispatch buffer-completion callbacks for the trace records in // this buffer @@ -350,7 +358,14 @@ class OmptTracingBufferMgr { void destroyHelperThreads(); public: - OmptTracingBufferMgr(); + OmptTracingBufferMgr() + : OMPX_FlushOnBufferFull("LIBOMPTARGET_OMPT_FLUSH_ON_BUFFER_FULL", true), + OMPX_FlushOnShutdown("LIBOMPTARGET_OMPT_FLUSH_ON_SHUTDOWN", true) { + // no need to hold locks for init() since object is getting constructed + // here. + init(); + } + OmptTracingBufferMgr(const OmptTracingBufferMgr &) = delete; OmptTracingBufferMgr &operator=(const OmptTracingBufferMgr &) = delete; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 06efc96de1a89..dbe53082098ee 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1904,6 +1904,10 @@ struct AMDGPUStreamTy { /// Use synchronous copy back. bool UseSyncCopyBack; + /// When copying data from one host buffer to another, only do it + /// asynchronously if `MinHostToHostAsyncCopySize <= size`. + UInt32Envar OMPX_MinHostToHostAsyncCopySize; + /// Arguments for the callback function. PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs; @@ -2306,6 +2310,14 @@ struct AMDGPUStreamTy { return Err; } + if (CopySize < OMPX_MinHostToHostAsyncCopySize) { + if (auto Err = + OutputSignals[0]->wait(StreamBusyWaitMicroseconds, &Device)) + return Err; + std::memcpy(Dst, Inter, CopySize); + return Error::success(); + } + // Consume another stream slot and compute dependencies. 
std::tie(Curr, InputSignal) = consume(OutputSignals[1]); assert(InputSignal && "Invalid input signal"); @@ -2904,14 +2916,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}), AMDGenericDeviceTy(), OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4), OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512), - OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 6), + OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4), OMPX_GenericSpmdTeamsPerCU( - "LIBOMPTARGET_AMDGPU_GENERIC_SPMD_TEAMS_PER_CU", 0), + "LIBOMPTARGET_AMDGPU_GENERIC_SPMD_TEAMS_PER_CU", 6), OMPX_BigJumpLoopTeamsPerCU( "LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_TEAMS_PER_CU", 0), OMPX_BigJumpLoopMaxTotalTeams( "LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_MAX_TOTAL_TEAMS", 1024 * 1024), - OMPX_LowTripCount("LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT", 4000), + OMPX_LowTripCount("LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT", 9000), OMPX_SmallBlockSize("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32), OMPX_NumBlocksForLowTripcount("LIBOMPTARGET_BLOCKS_FOR_LOW_TRIP_COUNT", 0), @@ -2929,7 +2941,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000), OMPX_UseMultipleSdmaEngines( // setting default to true here appears to solve random sdma problem - "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false), + "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", true), OMPX_ApuMaps("OMPX_APU_MAPS", false), OMPX_EnableGFX90ACoarseGrainUsmMaps( "OMPX_ENABLE_GFX90A_COARSE_GRAIN_USM_MAPS", false), @@ -2942,7 +2954,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { 1 * 1024 * 1024), // 1MB OMPX_DGPUMaps("OMPX_DGPU_MAPS", false), OMPX_SharedDescriptorMaxSize("LIBOMPTARGET_SHARED_DESCRIPTOR_MAX_SIZE", - 48), + 96), + OMPX_EnableDevice2DeviceMemAccess( + "OMPX_ENABLE_DEVICE_TO_DEVICE_MEM_ACCESS", false), AMDGPUStreamManager(*this, Agent), 
AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {} @@ -4376,8 +4390,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // If this value needs to go above UINT_MAX, consider // adding sizeof(size_t) check to avoid unpleasant truncation // surprises where size_t is still 32bit. - constexpr size_t Almost2Gig = 2000000000u; - return Almost2Gig; + constexpr size_t Almost3Gig = 3000000000u; + return Almost3Gig; } return 0; } @@ -4539,6 +4553,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// memory. Default value is 48. UInt32Envar OMPX_SharedDescriptorMaxSize; + // Determines whether we call HSA API, upon device memory allocation, + // for making the memory acceccible from other agents. + // Default is disabled + BoolEnvar OMPX_EnableDevice2DeviceMemAccess; + /// Stream manager for AMDGPU streams. AMDGPUStreamManagerTy AMDGPUStreamManager; @@ -4713,7 +4732,9 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device) Slots(32), NextSlot(0), SyncCycle(0), StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()), UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()), - UseSyncCopyBack(Device.syncCopyBack()) {} + UseSyncCopyBack(Device.syncCopyBack()), + OMPX_MinHostToHostAsyncCopySize( + "LIBOMPTARGET_AMDGPU_MIN_HOST_TO_HOST_ASYNC_COPY_SIZE", 2048) {} /// Class implementing the AMDGPU-specific functionalities of the global /// handler. @@ -5327,7 +5348,8 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { } } - if (Alloc) { + if (Alloc && (Kind == TARGET_ALLOC_HOST || Kind == TARGET_ALLOC_SHARED || + OMPX_EnableDevice2DeviceMemAccess)) { // Get a list of agents that can access this memory pool. 
Inherently // necessary for host or shared allocations Also enabled for device memory // to allow device to device memcpy diff --git a/offload/src/OpenMP/OMPT/OmptTracingBuffer.cpp b/offload/src/OpenMP/OMPT/OmptTracingBuffer.cpp index cebae6982feab..a39d877ec431e 100644 --- a/offload/src/OpenMP/OMPT/OmptTracingBuffer.cpp +++ b/offload/src/OpenMP/OMPT/OmptTracingBuffer.cpp @@ -129,8 +129,8 @@ void *OmptTracingBufferMgr::assignCursor(ompt_callbacks_t Type, lck.unlock(); // Schedule the full buffer for flushing till the corresponding cursor. - if (ToBeFlushedCursor) - setComplete(ToBeFlushedCursor, ToBeFlushedBuf); + if (OMPX_FlushOnBufferFull && ToBeFlushedCursor) + triggerFlushOnBufferFull(ToBeFlushedCursor, ToBeFlushedBuf); DP("Thread %lu: Assigned %lu bytes at %p in new buffer with id %lu for " "device %ld\n", @@ -148,10 +148,10 @@ void *OmptTracingBufferMgr::assignCursor(ompt_callbacks_t Type, * called without holding any lock. * Note lock order: buf_lock -> flush_lock */ -void OmptTracingBufferMgr::setComplete(void *cursor, BufPtr Buf) { +void OmptTracingBufferMgr::triggerFlushOnBufferFull(void *cursor, BufPtr Buf) { std::unique_lock buf_lock(BufferMgrMutex); - // Between calling setComplete and this check, a flush-all may have + // Between calling this function and this check, a flush-all may have // delivered this buffer to the tool and deleted it. So the buffer // may not exist. if (Id2BufferMap.find(Buf->Id) == Id2BufferMap.end()) @@ -507,6 +507,11 @@ OmptTracingBufferMgr::findAndReserveFlushedBuf(uint64_t flush_id) { } assert(flush_itr->second.FlushStatus == Flush_waiting); flush_itr->second.FlushStatus = Flush_processing; + // Update the metadata cursor since more trace records may have been + // generated. 
+ flush_itr->second.FlushCursor = + flush_itr->second.FlushBuf->Cursor.load(std::memory_order_acquire); + FlushInfo flush_info(flush_itr->first, flush_itr->second.FlushCursor, flush_itr->second.FlushBuf); DP("Reserved buffer: flush_id:%lu, cursor:%p, buf:%p\n", flush_itr->first, @@ -724,7 +729,8 @@ void OmptTracingBufferMgr::shutdownHelperThreads() { void OmptTracingBufferMgr::flushAndShutdownHelperThreads() { std::unique_lock Lock(llvm::omp::target::ompt::TraceControlMutex); // Flush buffers for all devices. - flushAllBuffers(MAX_NUM_DEVICES); + if (OMPX_FlushOnShutdown) + flushAllBuffers(MAX_NUM_DEVICES); shutdownHelperThreads(); } @@ -742,9 +748,4 @@ void OmptTracingBufferMgr::destroyHelperThreads() { CompletionThreads.clear(); HelperThreadIdMap.clear(); } - -OmptTracingBufferMgr::OmptTracingBufferMgr() { - // no need to hold locks for init() since object is getting constructed here - init(); -} #endif diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 855e09876a41a..4d80129e35b25 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -134,7 +134,8 @@ elif config.libomptarget_current_target.startswith('amdgcn'): # only check the first one assuming that we will run the test on it. if not (config.amdgpu_test_arch.startswith("gfx90a") or config.amdgpu_test_arch.startswith("gfx940") or - config.amdgpu_test_arch.startswith("gfx942")): + config.amdgpu_test_arch.startswith("gfx942") or + config.amdgpu_test_arch.startswith("gfx950")): supports_unified_shared_memory = False # check if AMD architecture is an APU: if (config.amdgpu_test_arch.startswith("gfx940") or diff --git a/offload/test/offloading/fortran/target-data-map-if-present.f90 b/offload/test/offloading/fortran/target-data-map-if-present.f90 new file mode 100644 index 0000000000000..c181573cd7a1c --- /dev/null +++ b/offload/test/offloading/fortran/target-data-map-if-present.f90 @@ -0,0 +1,29 @@ +! Offloading test that tests that if(present(a)) compiles and executes without +! 
causing any compilation errors, primarily a regression test that does not +! yield interesting results. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +module mod + implicit none +contains + subroutine routine(a) + implicit none + real, dimension(:), optional :: a + integer :: i + !$omp target data if(present(a)) map(alloc:a) + do i = 1, 10 + a(i) = i + end do + !$omp end target data + end subroutine routine +end module mod + +program main + use mod + real :: a(10) + call routine(a) + print *, a +end program main + +! CHECK: 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. diff --git a/offload/test/offloading/fortran/target-teams-dist-nest-par.f90 b/offload/test/offloading/fortran/target-teams-dist-nest-par.f90 new file mode 100644 index 0000000000000..dfde1b98f3c86 --- /dev/null +++ b/offload/test/offloading/fortran/target-teams-dist-nest-par.f90 @@ -0,0 +1,26 @@ +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-generic +! RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic +program main + integer :: array(10) = 0 + integer :: x, y, z + !$omp target + !$omp teams distribute private(x, y) + OuterLoopOne: do x=1,1 + array(2) = 42 + OuterLoopTwo: do y=1,1 + !$omp parallel do private(z) + InnerLoopOne: do z=1,10 + array(z) = 20 + enddo InnerLoopOne + !$omp end parallel do + enddo OuterLoopTwo + enddo OuterLoopOne + !$omp end teams distribute + !$omp end target + ! Expected to print all 20's + print *, array +end program main + +! 
CHECK: 20 20 20 20 20 20 20 20 20 20 diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt index e6515c65f797a..6d629b3e2b58d 100644 --- a/openmp/CMakeLists.txt +++ b/openmp/CMakeLists.txt @@ -68,6 +68,8 @@ else() set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}" CACHE STRING "Path where built OpenMP libraries should be installed.") endif() + set(OPENMP_INSTALL_CFGDIR "lib/cmake" CACHE STRING + "Path where OpenMP config should be installed") if (NOT MSVC) set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) @@ -123,6 +125,8 @@ if (OPENMP_STANDALONE_BUILD) set(OPENMP_LIBDIR_SUFFIX "/asan") set(OPENMP_INSTALL_LIBDIR "lib${OPENMP_LIBDIR_SUFFIX}") endif() + set(OPENMP_INSTALL_CFGDIR "${OPENMP_INSTALL_LIBDIR}/cmake" CACHE STRING + "Path where OpenMP config should be installed") endif() # Check and set up common compiler flags. diff --git a/openmp/EnableOpenmpRuntime b/openmp/EnableOpenmpRuntime new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/openmp/libompd/gdb-plugin/ompd/ompd_handles.py b/openmp/libompd/gdb-plugin/ompd/ompd_handles.py index 1929a92617415..da97a4086eee6 100644 --- a/openmp/libompd/gdb-plugin/ompd/ompd_handles.py +++ b/openmp/libompd/gdb-plugin/ompd/ompd_handles.py @@ -1,5 +1,4 @@ import ompdModule -import imp class ompd_parallel(object): diff --git a/openmp/runtime/openmp-config.cmake.in b/openmp/runtime/openmp-config.cmake.in new file mode 100644 index 0000000000000..20eb531c97dbf --- /dev/null +++ b/openmp/runtime/openmp-config.cmake.in @@ -0,0 +1,13 @@ +@PACKAGE_INIT@ + +# Partial path copied from build variable OPENMP_INSTALL_LIBDIR +set( openmp_LIB_DIR "@OPENMP_INSTALL_LIBDIR@" ) + +# Full path to libomp.so using PACKAGE_PREFIX_DIR and OPENMP_INSTALL_LIBDIR partial path. +set_and_check( openmp_LIB_INSTALL_DIR "@PACKAGE_OPENMP_INSTALL_LIBDIR@" ) + +# Full path to omp.h using PACKAGE_PREFIX and LIBOMP_HEADERS_INSTALL_PATH partial path. 
+set_and_check( openmp_INCLUDE_DIR "@PACKAGE_LIBOMP_HEADERS_INSTALL_PATH@" ) +set_and_check( openmp_INCLUDE_DIRS "${openmp_INCLUDE_DIR}" ) + +include( "${CMAKE_CURRENT_LIST_DIR}/openmpTargets.cmake" ) diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt index 7c5d399d4408c..72c1ba25d05f4 100644 --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -457,13 +457,28 @@ if(WIN32) \"${alias}${CMAKE_STATIC_LIBRARY_SUFFIX}\" WORKING_DIRECTORY \"${outdir}\")") endforeach() else() - if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") install(FILES ${LIBOMP_LIBRARY_DIR}/libomp.a DESTINATION "${OPENMP_INSTALL_LIBDIR}" COMPONENT runtime) else() - install(TARGETS omp ${export_to_llvmexports} ${LIBOMP_INSTALL_KIND} DESTINATION "${OPENMP_INSTALL_LIBDIR}") + install(TARGETS omp ${export_to_llvmexports} EXPORT openmpTargets ${LIBOMP_INSTALL_KIND} + DESTINATION "${OPENMP_INSTALL_LIBDIR}") + install(EXPORT openmpTargets FILE openmpTargets.cmake NAMESPACE OpenMP:: + DESTINATION ${OPENMP_INSTALL_CFGDIR}/openmp) + target_include_directories(omp PUBLIC $) endif() + # Create cmake configuration files + include(CMakePackageConfigHelpers) + + configure_package_config_file( + ../openmp-config.cmake.in + openmp-config.cmake + INSTALL_DESTINATION ${OPENMP_INSTALL_CFGDIR}/openmp + PATH_VARS LIBOMP_HEADERS_INSTALL_PATH OPENMP_INSTALL_LIBDIR) + + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/openmp-config.cmake + DESTINATION ${OPENMP_INSTALL_CFGDIR}/openmp) + if(${LIBOMP_INSTALL_ALIASES}) # Create aliases (symlinks) of the library for backwards compatibility extend_path(outdir "${CMAKE_INSTALL_PREFIX}" "${OPENMP_INSTALL_LIBDIR}") diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var index 623f0b9c315bd..6884745f4240c 100644 --- a/openmp/runtime/src/include/ompx.h.var +++ b/openmp/runtime/src/include/ompx.h.var @@ -9,13 +9,21 @@ #ifndef __OMPX_H #define __OMPX_H -#ifdef __AMDGCN_WAVEFRONT_SIZE -#define __WARP_SIZE 
__AMDGCN_WAVEFRONT_SIZE -#else -#define __WARP_SIZE 32 +#if (defined(__NVPTX__) || defined(__AMDGPU__)) +#include +#define __OMPX_TARGET_IS_GPU #endif typedef unsigned long uint64_t; +typedef unsigned int uint32_t; + +static inline uint32_t __warpSize(void) { +#ifdef __OMPX_TARGET_IS_GPU + return __gpu_num_lanes(); +#else + __builtin_trap(); +#endif +} #ifdef __cplusplus extern "C" { @@ -212,7 +220,7 @@ static inline uint64_t ballot_sync(uint64_t mask, int pred) { ///{ #define _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(TYPE, TY) \ static inline TYPE shfl_down_sync(uint64_t mask, TYPE var, unsigned delta, \ - int width = __WARP_SIZE) { \ + int width = __warpSize()) { \ return ompx_shfl_down_sync_##TY(mask, var, delta, width); \ }